In [20]:
import pandas as pd
import sklearn
import sklearn.feature_extraction.text
import sklearn.model_selection
import sklearn.linear_model
import scipy

In [2]:
# Load the training data (job descriptions, location, contract type, salary).
salary_train = pd.read_csv(filepath_or_buffer="salary-train.csv")

In [3]:
# Load the small test set whose salaries are to be predicted.
salary_test = pd.read_csv(filepath_or_buffer="salary-test-mini.csv")

In [4]:
# Preview the first rows of the raw training data.
salary_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [5]:
# Preview the test data; use head() like the other previews so the cell never
# renders an entire frame if a larger test file is substituted.
salary_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [6]:
def foo(s):
    """Return ``s`` converted to lower case.

    ``s`` must provide a ``lower()`` method (e.g. a ``str``).
    """
    lowered = s.lower()
    return lowered

# Lower-case the description text with the vectorized string accessor.
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist, and .str.lower() leaves missing values as NaN
# instead of raising like a per-row Python call would.
salary_train['FullDescription'] = salary_train['FullDescription'].str.lower()

In [7]:
# Check the training data after lower-casing FullDescription.
salary_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager// luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,Sutton Coldfield,,20355


In [8]:
# Replace every character that is not an ASCII letter or digit with a space.
salary_train['FullDescription'] = salary_train['FullDescription'].replace(
    to_replace='[^a-zA-Z0-9]',
    value=' ',
    regex=True,
)

In [9]:
# Check the training data after replacing non-alphanumeric characters with spaces.
salary_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355


In [10]:
# Encode missing categorical values as the literal string 'nan'.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection mutates a temporary object, which pandas warns about and
# which leaves the frame unchanged when Copy-on-Write is enabled.
for column in ('LocationNormalized', 'ContractTime'):
    salary_train[column] = salary_train[column].fillna('nan')

In [11]:
# TF-IDF encoder for the description text; min_df=5 is the minimum document
# frequency a term needs to be kept in the vocabulary.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    min_df=5,
)

In [12]:
# Display the vectorizer's configuration.
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
# Fit the TF-IDF vocabulary on the training descriptions and encode them.
X_FullDescription_Vec = vectorizer.fit_transform(
    raw_documents=salary_train['FullDescription'],
)

In [14]:
# Display the summary (shape, dtype, stored elements) of the TF-IDF matrix.
X_FullDescription_Vec

<60000x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 8365759 stored elements in Compressed Sparse Row format>

In [15]:
# Encoder that turns per-row {column: value} dicts into a feature matrix;
# it is applied to the categorical columns in later cells.
enc = sklearn.feature_extraction.DictVectorizer()

In [16]:
# Display the encoder's configuration.
enc

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [17]:
# Fit the encoder on the categorical columns, passed as one dict per row.
X_train_categ = enc.fit_transform(
    salary_train[['LocationNormalized', 'ContractTime']].to_dict(orient='records'),
)

In [18]:
# Display the summary of the encoded categorical feature matrix.
X_train_categ

<60000x1766 sparse matrix of type '<class 'numpy.float64'>'
	with 120000 stored elements in Compressed Sparse Row format>

In [19]:
# Join text and categorical features column-wise into one training matrix.
X_train = scipy.sparse.hstack(
    (X_FullDescription_Vec, X_train_categ),
)

In [21]:
# Ridge regression with a fixed seed for reproducible results.
model = sklearn.linear_model.Ridge(
    random_state=241,
    alpha=1,
)

In [23]:
# Train on the combined features against the normalized salary.
model.fit(X=X_train, y=salary_train['SalaryNormalized'])

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [24]:
# Lower-case the test descriptions with the vectorized string accessor.
# Bracket assignment is used because attribute-style assignment only works for
# columns that already exist, and .str.lower() leaves missing values as NaN
# instead of raising like a per-row Python call would.
salary_test['FullDescription'] = salary_test['FullDescription'].str.lower()

In [27]:
# Preview the test data.
# NOTE(review): this cell's execution counter (27) is later than the following
# cell's (26), so the saved output may not match a top-to-bottom run - confirm
# with Restart & Run All.
salary_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,we currently have a vacancy for an hr project ...,Milton Keynes,contract,
1,a web developer opportunity has arisen with an...,Manchester,permanent,


In [26]:
# Replace every character that is not an ASCII letter or digit with a space.
salary_test['FullDescription'] = salary_test['FullDescription'].replace(
    to_replace='[^a-zA-Z0-9]',
    value=' ',
    regex=True,
)

In [28]:
# Encode the test descriptions with the already-fitted vectorizer (no refit).
X_FullDescription_test = vectorizer.transform(
    raw_documents=salary_test['FullDescription'],
)

In [29]:
# Display the summary of the test TF-IDF matrix.
X_FullDescription_test

<2x22861 sparse matrix of type '<class 'numpy.float64'>'
	with 300 stored elements in Compressed Sparse Row format>

In [30]:
# Encode the test categorical columns with the already-fitted encoder.
# Missing values are filled with the string 'nan' first, mirroring the
# training preprocessing; otherwise a missing value reaches the encoder as a
# float NaN, matches no learned "<column>=<value>" feature and is dropped.
X_test_categ = enc.transform(
    salary_test[['LocationNormalized', 'ContractTime']]
    .fillna('nan')
    .to_dict('records')
)

In [31]:
# Join text and categorical features column-wise, in the same order as for training.
X_test = scipy.sparse.hstack(
    (X_FullDescription_test, X_test_categ),
)

In [32]:
# Predict the salary for each test row.
model.predict(X=X_test)

array([56555.61500155, 37188.32442618])