In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [2]:
# Load the training set straight from the zipped CSV and preview the first
# rows to check that the expected columns are present.
data_train = pd.read_csv(filepath_or_buffer="salary-train.csv.zip")
data_train.head(n=5)

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [3]:
# Load the small test set (SalaryNormalized is empty here; it is the value
# to be predicted) and preview it.
data_test = pd.read_csv(filepath_or_buffer="salary-test-mini.csv")
data_test.head(n=5)

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [4]:
# Normalise the training frame before vectorisation.
#
# Results are assigned back to the column explicitly instead of calling
# `inplace=True` on `data_train[col]`: an in-place call on a column selection
# is chained assignment, which pandas deprecates and which does not update
# the parent frame when Copy-on-Write is enabled.

# Categorical columns: missing values become the literal category 'nan' so
# that they get their own indicator feature, then everything is lower-cased.
data_train['LocationNormalized'] = (
    data_train['LocationNormalized'].fillna('nan').str.lower()
)
data_train['ContractTime'] = (
    data_train['ContractTime'].fillna('nan').str.lower()
)

# Free text: every character that is not a latin letter or digit is replaced
# by a space (keeps word boundaries), then the text is lower-cased.
data_train['FullDescription'] = (
    data_train['FullDescription']
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .str.lower()
)

data_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [5]:
# One-hot encode the two categorical columns. Each row is turned into a
# {column: value} dict and the vectorizer is fitted on the training rows;
# the fitted `enc` is reused later to encode the test rows.
enc = DictVectorizer()
X_train_categ = enc.fit_transform(
    data_train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)

In [6]:
# TF-IDF features for the job description text. `min_df=5` drops terms that
# occur in fewer than five training documents. The fitted `vectorizer` is
# reused later to transform the test descriptions.
vectorizer = TfidfVectorizer(min_df=5)
X_description = vectorizer.fit_transform(data_train.loc[:, 'FullDescription'])

In [7]:
# `hstack` comes from scipy.sparse and is imported with the other imports in
# the first cell, so every cell that stacks matrices sees it after a fresh
# "Restart & Run All".

# Training design matrix: categorical indicator columns first, TF-IDF
# description columns second. Any matrix passed to the fitted model later
# must be stacked in this same order.
X = hstack([X_train_categ, X_description])

# Regression target.
y = data_train['SalaryNormalized']

In [8]:
# Ridge regression on the stacked sparse features; the seed is pinned so a
# re-run reproduces the same model. The bare `fit` call is kept as the last
# expression so the fitted estimator is displayed.
clf = Ridge(random_state=241, alpha=1)
clf.fit(X, y)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [9]:
# Normalise the test frame exactly as the training frame was normalised
# (In [4]); the fitted vectorizers only recognise values that were prepared
# the same way.
#
# Results are assigned back explicitly rather than via `inplace=True` on a
# column selection, which is chained assignment and does not update the
# parent frame when pandas Copy-on-Write is enabled.

# Free text: non-alphanumeric characters become spaces, then lower-case.
data_test['FullDescription'] = (
    data_test['FullDescription']
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .str.lower()
)

# Categorical columns: fill missing values with the literal 'nan' category,
# as the training data does, so that a missing value in a test row maps onto
# the indicator feature learned for missing values instead of being dropped.
data_test['LocationNormalized'] = (
    data_test['LocationNormalized'].fillna('nan').str.lower()
)
data_test['ContractTime'] = (
    data_test['ContractTime'].fillna('nan').str.lower()
)

data_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,we currently have a vacancy for an hr project ...,milton keynes,contract,
1,a web developer opportunity has arisen with an...,manchester,permanent,


In [10]:
# Encode the test rows with the already-fitted transformers (transform only,
# no refitting), stack the pieces in the same order as the training matrix
# (categorical first, then TF-IDF), and predict salaries.
X_test_categ = enc.transform(
    data_test[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)
X_description_test = vectorizer.transform(data_test.loc[:, 'FullDescription'])
X_test = hstack((X_test_categ, X_description_test))
clf.predict(X_test)

array([ 56555.61500155,  37188.32442618])