## Question 0: Salary Prediction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import time

In [2]:
# read file
originalsalary = pd.read_csv("http://www.webpages.uidaho.edu/~stevel/Datasets/salary_uk.csv")

## Data Preprocessing

In [3]:
salary = originalsalary
# preprocessing

#get rid of useless columns
salary = salary.drop(['LocationRaw', 'SalaryRaw', 'SourceName'], axis=1)

#this loop does preprocessing on columns that are strings
for n in salary.columns:
    if salary[n].dtype == 'object':
        salary[n] = salary[n].str.lower()
        salary[n] = salary[n].str.strip()

#replace null values with the mode (most frequent occurence in the feature)
salary['Title'].fillna(value=salary['Title'].value_counts().axes[0][0], inplace=True)
salary['ContractType'].fillna(value=salary['ContractType'].value_counts().axes[0][0], inplace=True)
salary['ContractTime'].fillna(value=salary['ContractTime'].value_counts().axes[0][0], inplace=True)
salary['Company'].fillna(value=salary['Company'].value_counts().axes[0][0], inplace=True)

In [4]:
#tf-idf
#first we create vector using only 10000 words and removing useless stop words
#we just consider words that matter to the target variable (salary)
vector = TfidfVectorizer(stop_words='english', max_features=10000)
fd = pd.DataFrame(vector.fit_transform(salary.FullDescription).toarray(), columns = vector.get_feature_names())
salary = pd.concat([salary.drop('FullDescription', axis=1), fd], axis=1)

In [5]:
#preprocess categorical features
salary = pd.get_dummies(salary, columns=['Title', 'LocationNormalized', 'ContractTime', 'ContractType', 'Company', 'Category'], prefix=['Title', 'LocationNormalized', 'ContractTime', 'ContractType', 'Company', 'Category'])


## Training and Testing Models

In [6]:
#split dataset into training and testing 80% 20%
X = salary.drop('SalaryNormalized', axis=1)
Y = salary.SalaryNormalized
pca = PCA(n_components=1000)
pca.fit(X)
X = pca.transform(X)
xTrain, xTest, yTrain, yTest = train_test_split(X, Y, random_state=int(time.time()), test_size=0.20)

## Linear Regression

In [7]:
lr = LinearRegression().fit(xTrain, yTrain)

print("lr.coef_:", lr.coef_)
print("lr.intercept_:", lr.intercept_)

print("Training set score: {:.2f}".format(lr.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(lr.score(xTest, yTest)))

lr.coef_: [-1.68506054e-04 -2.18544942e+03  1.33903345e+04  4.15710762e+03
 -4.09820590e+02  6.11782775e+03 -3.72618227e+03  5.86864864e+03
  1.18676966e+04 -2.45909062e+03  1.42076357e+03 -4.44471327e+03
 -6.11165549e+03  5.44080705e+02 -8.46932603e+03 -6.08845224e+02
 -7.94344566e+03 -6.66260643e+02 -6.50840444e+01 -4.51696185e+03
  3.57893146e+03  4.45006216e+03  1.43016622e+03  6.48329254e+03
 -3.34316864e+03  2.08883652e+03 -2.52713626e+03 -9.10669117e+03
  7.74476292e+01  1.47293891e+03 -2.22769799e+03  3.33058253e+03
  1.05151210e+04  1.03817115e+04  3.90101945e+03 -6.08273844e+03
  4.27603670e+03 -2.96943679e+03  6.30195029e+03  1.69346433e+03
  2.87082836e+03  4.62368314e+03 -1.04084131e+03 -1.56699171e+04
 -3.57485225e+03  5.82276600e+03 -9.26781243e+02  9.91568137e+03
 -2.50222777e+03  2.03246546e+03  1.02949644e+04 -1.48466416e+04
  7.89270997e+02 -9.61666789e+03 -7.75714555e+02 -4.42278526e+03
  4.45340092e+03  1.65829196e+03  8.58971151e+03 -1.02849678e+04
  1.25276852e+0

## Ridge Regression

In [8]:
ridge = Ridge().fit(xTrain, yTrain)
print("Training set score: {:.2f}".format(ridge.score(xTrain, yTrain)))
print("Test set score: {:.2f}".format(ridge.score(xTest, yTest)))

Training set score: 0.68
Test set score: 0.59


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number2.853767e-18
  overwrite_a=True).T


## Decision Trees

In [9]:
#tree = DecisionTreeClassifier(random_state=int(time.time()))
#tree.fit(xTrain, yTrain)
#print("Accuracy on training set: {:.3f}".format(tree.score(xTrain, yTrain)))
#print("Accuracy on test set: {:.3f}".format(tree.score(xTest, yTest)))

## K-Nearest Neighbors

In [10]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(xTrain, yTrain)
print("Test set predictions:", clf.predict(xTest))
print("Test set accuracy: {:.2f}".format(clf.score(xTest, yTest)))

Test set predictions: [20000 26000 14000 ... 25500 37500 32500]
Test set accuracy: 0.07


## Support Vector Machines

In [11]:
#svc = SVC()
#svc.fit(xTrain, yTrain)

#print("Accuracy on training set: {:.2f}".format(svc.score(xTrain, yTrain)))
#print("Accuracy on test set: {:.2f}".format(svc.score(xTest, yTest)))

## Gradient Boosted Regression

In [None]:
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(xTrain, yTrain)

print("Accuracy on training set: {:.3f}".format(gbrt.score(xTrain, yTrain)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(xTest, yTest)))

## Conclusions