In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.sparse import hstack
from sklearn import datasets
from sklearn import preprocessing as prep
from sklearn import svm
from sklearn import utils
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold

In [0]:
# Read the salary training set from the mounted Drive folder.
dataset_url = '/content/drive/My Drive/coursera/Week 4/week_4_salary-train.csv'
train = pd.read_csv(filepath_or_buffer=dataset_url)

# Read the small test file; `dataset_url` is deliberately rebound to it.
dataset_url = '/content/drive/My Drive/coursera/Week 4/week_4_salary-test-mini.csv'
test = pd.read_csv(filepath_or_buffer=dataset_url)

In [49]:
# Display the loaded training frame (pandas truncates the rendering of long frames).
train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355
...,...,...,...,...
59995,"As a result of continued growth, First Class S...",Whitley Bay,contract,26400
59996,PHP / MVC Web Developer MacclesfieldCirca ***...,Macclesfield,permanent,26000
59997,"Staff Nurse, Nursing Home, Baldock White Recru...",Baldock,,24500
59998,This is one of the best agency side opportunit...,The City,permanent,65000


In [0]:
# Separate the regression target (salary) from the predictor columns.
target_column = "SalaryNormalized"
y_train = train[target_column]
X_train = train.drop(columns=[target_column])

In [0]:
# Swap every character that is not an ASCII letter or digit for a single space.
# The regex replacement is applied to the whole frame, i.e. to all feature columns.
X_train = X_train.replace(to_replace='[^a-zA-Z0-9]', value=' ', regex=True)

In [52]:
# Per-column count of missing values before imputation.
X_train.isnull().sum()

FullDescription           0
LocationNormalized        0
ContractTime          15582
dtype: int64

In [0]:
# Replace missing categorical values with the literal string 'nan' so the
# later lower-casing step and the DictVectorizer only ever receive strings.
#
# The result is assigned back instead of calling `fillna(inplace=True)` on a
# column selection: that form is chained assignment, which newer pandas
# versions warn about and which leaves `X_train` unchanged under Copy-on-Write.
X_train = X_train.fillna({'LocationNormalized': 'nan', 'ContractTime': 'nan'})

In [54]:
# Re-check the per-column missing-value counts after filling.
X_train.isnull().sum()

FullDescription       0
LocationNormalized    0
ContractTime          0
dtype: int64

In [0]:
def _lowercase_column(column):
    """Return the column with `str.lower` applied to every value."""
    return column.map(str.lower)


# Lower-case every cell of every feature column.
X_train = X_train.apply(_lowercase_column)

In [56]:
# Display the cleaned, lower-cased feature frame for a visual sanity check.
X_train

Unnamed: 0,FullDescription,LocationNormalized,ContractTime
0,international sales manager london k ...,london,permanent
1,an ideal opportunity for an individual that ha...,london,permanent
2,online content and brand manager luxury reta...,south east london,permanent
3,a great local marketleader is seeking a perman...,dereham,permanent
4,registered nurse rgn nursing home for young...,sutton coldfield,
...,...,...,...
59995,as a result of continued growth first class s...,whitley bay,contract
59996,php mvc web developer macclesfieldcirca ...,macclesfield,permanent
59997,staff nurse nursing home baldock white recru...,baldock,
59998,this is one of the best agency side opportunit...,the city,permanent


In [0]:
# TF-IDF for the free-text descriptions; min_df=5 drops terms that appear in
# fewer than five documents.
vectorizer = TfidfVectorizer(min_df=5)

# Encoder for the categorical columns, fed as one dict per row.
enc = DictVectorizer()

In [0]:
# Learn the TF-IDF vocabulary and weights from the cleaned training descriptions.
X_train_vec = vectorizer.fit_transform(X_train['FullDescription'])

# Apply the same cleaning to the test descriptions that the training text got
# (non-alphanumeric characters -> space, then lower-case), so both sets are
# tokenised identically before being projected onto the fitted vocabulary.
# NOTE: predictions recorded below were produced with the raw test text and
# may shift slightly once this cell is re-run.
test_descriptions = (
    test['FullDescription']
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .map(str.lower)
)
X_test_vec = vectorizer.transform(test_descriptions)

In [0]:
# One-hot encode the two categorical columns.
categorical_columns = ['LocationNormalized', 'ContractTime']

# Fit the encoder on the cleaned training categories.
X_train_categ = enc.fit_transform(X_train[categorical_columns].to_dict('records'))

# `X_test` was never defined in this notebook (it only existed as leftover
# kernel state), so build the test categories directly from `test`, using the
# same cleaning steps as the training frame: regex clean, fill missing values
# with 'nan', then lower-case. DictVectorizer silently ignores feature values
# it did not see during fit, so mismatched casing would encode as all zeros.
test_categories = (
    test[categorical_columns]
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .fillna('nan')
    .apply(lambda column: column.map(str.lower))
)
X_test_categ = enc.transform(test_categories.to_dict('records'))

In [0]:
# Join the feature blocks column-wise: TF-IDF text features first, followed
# by the encoded categorical features. The same order is used for train and test.
X_for_train = hstack((X_train_vec, X_train_categ))
X_for_test = hstack((X_test_vec, X_test_categ))

In [0]:
# `Ridge` is imported with the rest of the dependencies in the first cell.
# Fit a ridge regression (alpha=1, fixed random_state) on the stacked training
# features, then predict salaries for the test rows.
clf_ridge = Ridge(alpha=1, random_state=241)
clf_ridge.fit(X_for_train, y_train)
y_pred = clf_ridge.predict(X_for_test)

In [62]:
# Show the predicted salaries rounded to two decimal places.
np.round(y_pred, decimals=2)

array([55865.19, 37144.01])

In [63]:
# Read the daily closing prices (one `date` column plus one column per ticker).
dataset_url = '/content/drive/My Drive/coursera/Week 4/week_4_close_prices.csv'
data = pd.read_csv(filepath_or_buffer=dataset_url)

# Display the loaded frame.
data

Unnamed: 0,date,AXP,BA,CAT,CSCO,CVX,DD,DIS,GE,GS,HD,IBM,INTC,JNJ,JPM,KO,MCD,MMM,MRK,MSFT,NKE,PFE,PG,T,TRV,UNH,UTX,V,VZ,WMT,XOM
0,2013-09-23,76.440002,117.510002,85.029999,24.270000,125.519997,59.409999,64.750000,24.280001,165.250000,75.910004,190.990005,23.620001,89.089996,51.459999,38.630001,97.279999,121.110001,47.680000,32.740002,68.980003,28.799999,79.279999,34.220001,86.379997,71.820000,109.419998,196.240005,47.980000,76.419998,87.750000
1,2013-09-24,76.070000,119.000000,85.110001,24.139999,124.489998,59.319997,64.320000,24.320000,162.970001,76.040001,189.970001,23.700001,88.220001,50.320000,38.529999,97.779999,120.800003,47.529999,32.450001,69.430000,28.709999,78.620003,34.090000,85.870003,72.320000,110.000000,193.339996,47.270000,75.750000,87.360001
2,2013-09-25,75.989998,118.510002,84.500000,24.430000,124.070000,59.319997,64.449997,24.230000,162.309998,75.519997,189.470001,23.700001,87.080002,51.700001,38.330002,97.620003,120.199997,47.669998,32.509998,68.919998,28.490000,77.720001,34.049999,85.980003,71.980003,109.260002,191.559998,46.950001,74.650002,87.139999
3,2013-09-26,76.320000,119.379997,84.199997,23.770000,123.489998,59.509996,65.239998,24.250000,162.289993,76.070000,190.220001,23.410000,87.070000,51.889999,38.740002,98.190002,120.660004,47.680000,32.770000,70.339996,28.520000,78.050003,34.230000,85.830002,72.160004,109.660004,193.559998,47.669998,74.620003,87.070000
4,2013-09-27,75.889999,118.739998,83.800003,23.330000,122.639999,59.009995,65.190002,24.049999,159.850006,75.959999,186.919998,22.980000,86.730003,52.240002,38.400002,97.120003,120.220001,47.790001,33.270000,73.639999,28.879999,77.209999,33.980000,85.410004,71.989998,109.360001,193.050003,47.000000,74.360001,86.900002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,2015-03-12,81.559998,152.039993,80.190002,28.240000,102.419998,80.400001,107.169998,25.400000,189.949997,116.099998,157.979996,30.799999,99.830002,61.369999,40.570000,96.250000,164.470001,56.169998,41.020000,97.040001,34.000000,82.089996,33.130001,107.830002,114.879997,121.239998,269.579987,48.730000,81.900002,84.220001
370,2015-03-13,80.599998,151.570007,79.230003,27.940001,101.620003,80.499997,106.440002,25.040001,189.339996,114.820000,154.279999,30.930000,99.209999,61.000000,39.910000,96.349998,162.740005,56.200001,41.380001,95.809998,34.000000,81.830002,32.759998,106.720001,115.250000,118.739998,265.029999,48.840000,81.900002,83.870003
371,2015-03-16,81.500000,153.669998,79.970001,28.299999,103.129997,77.069994,107.370003,25.450001,191.899994,116.500000,157.080002,30.830000,101.059998,61.849998,40.290001,97.150002,166.210007,57.119999,41.560001,96.440002,34.439999,83.559998,33.060001,108.930000,118.519997,120.070000,269.019989,49.270000,83.290001,84.760002
372,2015-03-17,81.059998,154.509995,78.449997,28.150000,103.169998,74.680001,106.959999,25.309999,190.570007,115.580002,156.960007,30.590000,99.889999,61.610001,40.509998,96.169998,164.229996,56.470001,41.700001,96.540001,34.169998,82.849998,32.900002,108.750000,117.599998,119.860001,264.500000,48.950001,82.620003,84.080002


In [65]:
# `PCA` is imported with the rest of the dependencies in the first cell.
# Everything except the `date` column is a numeric price series to decompose.
prices = data.drop(columns=['date'])

# Fit a 10-component PCA; the fitted estimator is the cell's displayed value.
pca = PCA(n_components=10)
pca.fit(prices)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [66]:
# Fraction of total variance explained by each of the fitted components.
print(pca.explained_variance_ratio_)

[0.73897118 0.11007169 0.04995088 0.0287492  0.02215448 0.01931577
 0.00674853 0.00614091 0.00320594 0.00305611]


In [72]:
# Cumulative share of variance explained by the first four components.
leading_ratios = pca.explained_variance_ratio_[:4]
sum(leading_ratios)

0.9277429537836402

In [67]:
# Singular values corresponding to each of the fitted components.
print(pca.singular_values_)

[753.53936404 290.82384392 195.9132354  148.62951526 130.4736952
 121.82829504  72.01066717  68.69238032  49.6329401   48.4592519 ]


In [0]:
# Component loadings: one row per fitted component, one entry per input column.
print(pca.components_)

In [0]:
# Project the price columns (everything but `date`) onto the fitted components.
X = pca.transform(data.drop(columns=['date']))

In [0]:
# Scores of every row along the first principal component.
X[:, 0]

In [0]:
# Keep the first principal component's scores for the correlation check below.
first_component = X[:, 0]
first_component

In [97]:
# Read the Dow Jones index values from the mounted Drive folder.
dataset_url = '/content/drive/My Drive/coursera/Week 4/week_4_djia_index.csv'
data_jones = pd.read_csv(filepath_or_buffer=dataset_url)

# Display the loaded frame.
data_jones

Unnamed: 0,date,^DJI
0,2013-09-23,15401.379883
1,2013-09-24,15334.589844
2,2013-09-25,15273.259766
3,2013-09-26,15328.299805
4,2013-09-27,15258.240234
...,...,...
369,2015-03-12,17895.220703
370,2015-03-13,17749.310547
371,2015-03-16,17977.419922
372,2015-03-17,17849.080078


In [98]:
# Pearson correlation between the first principal component and the second
# column of the index frame (selected by position).
index_values = data_jones.iloc[:, 1]
np.corrcoef(first_component, index_values)

array([[1.        , 0.90965222],
       [0.90965222, 1.        ]])

In [81]:
# Pair each weight of the first component with its column name (skipping the
# leading `date` column), sort ascending by weight, then print in reverse so
# the largest weight comes first.
first_weights = pca.components_[0]
ticker_names = data.columns[1:]
ascending_pairs = sorted(zip(first_weights, ticker_names), key=lambda pair: pair[0])
print(ascending_pairs[::-1])

[(0.5796839457473608, 'V'), (0.3296155838158525, 'MMM'), (0.32156401737619067, 'UNH'), (0.2889960291670742, 'HD'), (0.251227032090898, 'GS'), (0.2339062895179173, 'DIS'), (0.21188886780132188, 'NKE'), (0.18947974464619466, 'TRV'), (0.1206449232286106, 'BA'), (0.11408956671451208, 'DD'), (0.09313201682640561, 'INTC'), (0.09139484029206306, 'JNJ'), (0.08716143338286819, 'WMT'), (0.07773169539162303, 'PG'), (0.0762295698765115, 'MSFT'), (0.07138971328880254, 'MRK'), (0.053683487328683256, 'UTX'), (0.050484236916854025, 'CSCO'), (0.04698793402659355, 'JPM'), (0.02905494171554044, 'KO'), (0.023092294068853506, 'PFE'), (0.01613838398473476, 'AXP'), (0.00010912222981037401, 'VZ'), (-0.006205137490081555, 'GE'), (-0.00720594589500823, 'T'), (-0.026106882759563018, 'MCD'), (-0.042942141992919666, 'XOM'), (-0.051661271080882996, 'CAT'), (-0.12585993303584916, 'CVX'), (-0.26499879525484726, 'IBM')]
