In [45]:
import pandas as pd
import scipy
from sklearn import feature_extraction, linear_model

In [46]:
salary_train = pd.read_csv('salary-train.csv')
salary_test = pd.read_csv('salary-test-mini.csv')

In [47]:
print salary_train.shape
print salary_train.info()
salary_train.head()

(60000, 4)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60000 entries, 0 to 59999
Data columns (total 4 columns):
FullDescription       60000 non-null object
LocationNormalized    60000 non-null object
ContractTime          44418 non-null object
SalaryNormalized      60000 non-null int64
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None


Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


Preprocessing

In [48]:
# Text -> lower()
def str_lower(x):
    if type(x) == str:
        return x.lower()
    return x

salary_train = salary_train.applymap(str_lower)
salary_test = salary_test.applymap(str_lower)
salary_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager// luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,sutton coldfield,,20355


In [49]:
text_columns = ['FullDescription', 'LocationNormalized', 'ContractTime']
for c in text_columns:
    salary_train[c] = salary_train[c].replace('[^a-zA-Z0-9]',' ', regex = True)
    salary_test[c] = salary_train[c].replace('[^a-zA-Z0-9]',' ', regex = True)
salary_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,33000
1,an ideal opportunity for an individual that ha...,london,permanent,50000
2,online content and brand manager luxury reta...,south east london,permanent,40000
3,a great local marketleader is seeking a perman...,dereham,permanent,22500
4,registered nurse rgn nursing home for young...,sutton coldfield,,20355


In [50]:
#Tf-Idf
vec = feature_extraction.text.TfidfVectorizer(min_df = 5)
train_desc = vec.fit_transform(salary_train['FullDescription'])
test_desc = vec.transform(salary_test['FullDescription'])
train_desc

<60000x22861 sparse matrix of type '<type 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [53]:
# Nan
cat_columns = ['LocationNormalized', 'ContractTime']
for col in cat_columns:
    salary_train[col].fillna('nan', inplace = True)
    salary_test[col].fillna('nan', inplace = True)
salary_test

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,london,permanent,
1,an ideal opportunity for an individual that ha...,london,permanent,


In [54]:
# One-hot
encoder = feature_extraction.DictVectorizer()
X_train_cat = encoder.fit_transform(salary_train[cat_columns].to_dict('records'))
X_test_cat = encoder.transform(salary_test[cat_columns].to_dict('records'))

In [56]:
new_salary_train = scipy.sparse.hstack([train_desc, X_train_cat])
new_salary_test = scipy.sparse.hstack([test_desc, X_test_cat])

<60000x24627 sparse matrix of type '<type 'numpy.float64'>'
	with 8485759 stored elements in COOrdinate format>

In [57]:
y_train = salary_train['SalaryNormalized']
reg = linear_model.Ridge(alpha = 1)
reg.fit(new_salary_train, y_train)
test_pred = reg.predict(new_salary_test)
print test_pred

(60000,)
[ 40213.64982026  41300.50794657]
