Описание в doc/04_statement-linreg.pdf

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import hstack

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def save_answer(name, value):
    """Write an answer to ``data/statement-linreg_<name>.txt`` (UTF-8).

    Parameters
    ----------
    name : str or int
        Suffix identifying the answer; it is inserted into the file name.
    value : object
        Answer to store. Strings are written unchanged; any other value
        (for example a number) is converted with ``str()`` first, so callers
        do not have to pre-format numeric answers.

    Notes
    -----
    The ``data`` directory must already exist; it is not created here.
    An existing file with the same name is overwritten.
    """
    with open(f'data/statement-linreg_{name}.txt', 'w', encoding='utf-8') as f:
        f.write(str(value))

In [3]:
# Load the training set and the test set, in that order.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/salary-train.csv', 'data/salary-test-mini.csv')
)

In [4]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [5]:
# Preview the first rows of the test data.
test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [6]:
# List the distinct ContractTime values; missing entries show up as nan.
train.ContractTime.unique()

array(['permanent', nan, 'contract'], dtype=object)

In [7]:
# Compare the number of distinct locations before and after lowercasing:
# equal counts mean no two labels differ only by letter case.
train.LocationNormalized.nunique(), train.LocationNormalized.str.lower().nunique()

(1763, 1763)

In [8]:
# Lowercase every string column in both frames so that neither the text
# features nor the categorical features depend on capitalisation.
for frame in (train, test):
    for column in ('FullDescription', 'LocationNormalized', 'ContractTime'):
        frame[column] = frame[column].str.lower()

In [9]:
# Re-inspect the first rows to confirm the string columns are now lowercase.
train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager// luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,sutton coldfield,,20355


In [10]:
# Replace each character that is not an ASCII letter or digit with a space,
# leaving whitespace-separated tokens in the descriptions.
for frame in (train, test):
    frame['FullDescription'] = frame['FullDescription'].replace(
        '[^a-zA-Z0-9]', ' ', regex=True
    )

In [11]:
# Re-inspect the first rows to confirm punctuation was replaced by spaces.
train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [12]:
# (rows, columns) of the training frame.
train.shape

(60000, 4)

In [13]:
# Learn the TF-IDF vocabulary from the training descriptions only, ignoring
# terms that occur in fewer than 5 documents, then encode the test
# descriptions with that same fitted vocabulary.
tfidf = TfidfVectorizer(min_df=5)
text_train = tfidf.fit_transform(train['FullDescription'])
text_test = tfidf.transform(test['FullDescription'])

In [14]:
# (documents, vocabulary terms) of the TF-IDF matrix.
text_train.shape

(60000, 22861)

In [15]:
# Replace missing categorical values with the literal label 'nan' so that
# "missing" becomes an ordinary category for the one-hot encoder.
#
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on frame[column]: that form is chained assignment
# through an inplace method, which triggers a FutureWarning in pandas 2.2 and
# leaves the frame unchanged under Copy-on-Write (the default in pandas 3).
for frame in (train, test):
    for column in ('LocationNormalized', 'ContractTime'):
        frame[column] = frame[column].fillna('nan')

In [16]:
# One-hot encode the categorical columns: each row becomes a
# {column: value} dict and DictVectorizer maps every distinct pair seen in
# training to its own indicator feature. The test rows reuse the fitted mapping.
categorical_columns = ['LocationNormalized', 'ContractTime']

enc = DictVectorizer()
categ_train = enc.fit_transform(train[categorical_columns].to_dict(orient='records'))
categ_test = enc.transform(test[categorical_columns].to_dict(orient='records'))

In [17]:
# (rows, one-hot features) of the encoded categorical matrix.
categ_train.shape

(60000, 1766)

In [18]:
# Target vector, followed by the design matrices: TF-IDF columns first and
# one-hot categorical columns second, in the same order for train and test.
y = train['SalaryNormalized']

X_train = hstack((text_train, categ_train))
X_test = hstack((text_test, categ_test))

In [19]:
# Ridge regression with a fixed seed; the hyper-parameters are kept in one
# place so they are easy to spot and tweak.
ridge_params = {'alpha': 1, 'random_state': 241}

estimator = Ridge(**ridge_params)
estimator.fit(X_train, y)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [20]:
# Predict salaries for the test rows and format them as a single
# space-separated string, each value rounded to two decimals.
predictions = estimator.predict(X_test)
answer = ' '.join(str(round(salary, 2)) for salary in predictions)

answer

'56555.62 37188.32'

In [21]:
# Persist the formatted predictions to data/statement-linreg_1.txt.
save_answer('1', answer)