In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import scale, StandardScaler
from sklearn.datasets import load_boston
from sklearn.linear_model import Perceptron, Ridge
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score,precision_recall_curve,confusion_matrix
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

In [4]:
# Read the training data from the CSV file in the working directory and
# display the resulting frame.
salary_train = pd.read_csv(filepath_or_buffer='salary-train.csv')

salary_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355
5,Sales and Marketing Assistant will provide adm...,Crawley,,22500
6,Vacancy Ladieswear fashion Area Manager / Regi...,UK,permanent,32000
7,Reference: LR/JAN/**** Our client is one of th...,Bristol,permanent,30000
8,Sponsorship Manager London The Company A marke...,Central London,permanent,31500
9,"About Barclays Barclays moves, lends, invests ...",South East London,permanent,42499


In [5]:
# Read the small test set; per the rendered output its SalaryNormalized
# column is empty, i.e. these are the rows to predict.
salary_test_mini = pd.read_csv(filepath_or_buffer='salary-test-mini.csv')

salary_test_mini

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [6]:
# Normalise the text columns before vectorising: lower-case each value and
# replace every character outside [a-zA-Z0-9] with a single space.
for col in ('FullDescription', 'LocationNormalized', 'ContractTime'):
    salary_train[col] = salary_train[col].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

# Fill missing categorical values with the literal string 'nan'.
# The result is assigned back to the frame rather than calling
# `salary_train[col].fillna(..., inplace=True)`: that form mutates a temporary
# column selection, which raises a chained-assignment FutureWarning in
# pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
for col in ('LocationNormalized', 'ContractTime'):
    salary_train[col] = salary_train[col].fillna('nan')

salary_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355
5,sales and marketing assistant will provide adm...,crawley,,22500
6,vacancy ladieswear fashion area manager regi...,uk,permanent,32000
7,reference lr jan our client is one of th...,bristol,permanent,30000
8,sponsorship manager london the company a marke...,central london,permanent,31500
9,about barclays barclays moves lends invests ...,south east london,permanent,42499


In [7]:
# Fit a TF-IDF vectorizer (min_df=5) on the cleaned job descriptions and
# encode them in the same step; X1 is the resulting sparse feature matrix.
vectorizer = TfidfVectorizer(min_df=5)
X1 = vectorizer.fit_transform(raw_documents=salary_train['FullDescription'])

X1

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [8]:
# Encode the two categorical columns: each row becomes a
# {column name: value} dict, which DictVectorizer turns into a sparse matrix.
enc = DictVectorizer()
X2 = enc.fit_transform(
    salary_train[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)

X2

<60000x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 120000 stored elements in Compressed Sparse Row format>

In [9]:
# Put the text features (X1) and categorical features (X2) side by side
# to form the full training design matrix.
X = hstack((X1, X2))

X

<60000x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 8485759 stored elements in COOrdinate format>

In [10]:
# Regression target: the normalized salary column of the training frame.
Y = salary_train.loc[:, 'SalaryNormalized']

Y

0        33000
1        50000
2        40000
3        22500
4        20355
5        22500
6        32000
7        30000
8        31500
9        42499
10       16000
11       47500
12       20000
13       34000
14       15360
15       39811
16       23000
17       47500
18       24500
19       32000
20       45000
21       20000
22       21000
23       34800
24       24500
25       25000
26       28000
27       27500
28       18000
29       40000
         ...  
59970    42500
59971    20000
59972    23750
59973    30500
59974    20500
59975    27682
59976    12960
59977    50000
59978    22000
59979    19500
59980    32640
59981    45000
59982    68750
59983    18720
59984    50000
59985    20000
59986    54000
59987    28000
59988    21000
59989    30000
59990    42000
59991    37000
59992    25920
59993    13440
59994    12585
59995    26400
59996    26000
59997    24500
59998    65000
59999    23040
Name: SalaryNormalized, Length: 60000, dtype: int64

In [11]:
# Ridge regression with alpha=1 and a fixed random_state, fitted on the
# combined feature matrix and the salary target.
clf = Ridge(random_state=241, alpha=1)

# fit() is the cell's last expression, so the fitted estimator is displayed.
clf.fit(X=X, y=Y)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [12]:
# Predict on the same matrix the model was fitted on (in-sample
# predictions) and display the resulting array.
Ypred = clf.predict(X=X)

Ypred

array([40202.25725568, 41347.93325162, 34073.22381711, ...,
       24351.12586784, 73125.59808392, 17744.40382246])

In [24]:
# Apply to the test frame the same normalisation used on the training frame:
# lower-case each value and replace every character outside [a-zA-Z0-9]
# with a single space.
for col in ('FullDescription', 'LocationNormalized', 'ContractTime'):
    salary_test_mini[col] = salary_test_mini[col].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

# Fill missing categorical values with the literal string 'nan'.
# The result is assigned back to the frame rather than calling
# `salary_test_mini[col].fillna(..., inplace=True)`: that form mutates a
# temporary column selection, which raises a chained-assignment FutureWarning
# in pandas 2.x and leaves the frame unchanged once Copy-on-Write is enabled.
for col in ('LocationNormalized', 'ContractTime'):
    salary_test_mini[col] = salary_test_mini[col].fillna('nan')

salary_test_mini

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,we currently have a vacancy for an hr project ...,milton keynes,contract,
1,a web developer opportunity has arisen with an...,manchester,permanent,


In [26]:
# Encode the test descriptions with the already-fitted TF-IDF vectorizer
# (transform only, no refit).
X1test = vectorizer.transform(raw_documents=salary_test_mini['FullDescription'])

X1test

<2x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Row format>

In [31]:
# Encode the test categorical columns with the already-fitted DictVectorizer
# (transform only, no refit).
X2test = enc.transform(
    salary_test_mini[['LocationNormalized', 'ContractTime']].to_dict(orient='records')
)

X2test

<2x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [32]:
# Put the test text and categorical features side by side, in the same
# order (text first, categorical second) as the training matrix.
Xtest = hstack((X1test, X2test))

Xtest

<2x24627 sparse matrix of type '<class 'numpy.float64'>'
	with 304 stored elements in COOrdinate format>

In [35]:
# Predict salaries for the test rows and show them rounded to two decimals.
Ypredtest = clf.predict(X=Xtest)

Ypredtest.round(2)

array([56555.62, 37188.32])