In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
data = pd.read_csv("C:\\Users\\sanja\\OneDrive\\Pictures\\All working files\data\\advertising (2).csv")
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [5]:
data.isnull().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [3]:
df = data.copy()

In [4]:
X = df.drop(columns='Sales', axis=1)
y = df['Sales']

In [5]:
px.scatter_3d(x=X['TV'], y=X['Radio'], z=X['Newspaper'])

In [6]:
X_train, X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_predicted = lr_model.predict(X_test)

In [8]:
r2_score(y_test,y_predicted)

0.9059011844150826

## Steps for PCA
1. Mean centring of data
2. Covariance matrix
3. eigen value and eigen vectors
4. transform original data

In [9]:
# step 1 mean centring
scaler = StandardScaler()
df.iloc[:,0:3] = scaler.fit_transform(df.iloc[:,0:3])

In [10]:
# step 2 covariance matrix
covariance_matrix = np.corrcoef([df.iloc[:,0], df.iloc[:,1], df.iloc[:,2]])
covariance_matrix

array([[1.        , 0.05480866, 0.05664787],
       [0.05480866, 1.        , 0.35410375],
       [0.05664787, 0.35410375, 1.        ]])

In [11]:
# step 3 find eigen values and eigen vectors
eigen_values, eigen_vectors = np.linalg.eig(covariance_matrix)

In [12]:
eigen_values

array([1.37085251, 0.98325614, 0.64589135])

In [13]:
1.37085251 + 0.98325614 + 0.64589135

2.9999999999999996

In [14]:
eigen_vectors

array([[ 0.20787391,  0.97814838, -0.0037659 ],
       [ 0.69139674, -0.14965532, -0.70680537],
       [ 0.69192412, -0.14432267,  0.70739804]])

In [34]:
# assume we want aur original data in 2d so we will select top 2 eigen vectors
pc = eigen_vectors[:3]
pc

array([[ 0.20787391,  0.97814838, -0.0037659 ],
       [ 0.69139674, -0.14965532, -0.70680537],
       [ 0.69192412, -0.14432267,  0.70739804]])

In [35]:
print(df.iloc[:,0:3].shape)
print(pc.shape)

(200, 3)
(3, 3)


In [37]:
df.iloc[:,0:3]

Unnamed: 0,TV,Radio,Newspaper
0,0.969852,0.981522,1.778945
1,-1.197376,1.082808,0.669579
2,-1.516155,1.528463,1.783549
3,0.052050,1.217855,1.286405
4,0.394182,-0.841614,1.281802
...,...,...,...
195,-1.270941,-1.321031,-0.771217
196,-0.617035,-1.240003,-1.033598
197,0.349810,-0.942899,-1.111852
198,1.594565,1.265121,1.640850


In [40]:
transform_values = np.dot(df.iloc[:,0:3], pc.T)
new_df = pd.DataFrame(transform_values, columns=['PC1', 'PC2',"PC3"])
new_df['Sales'] = df['Sales'].values
new_df

Unnamed: 0,PC1,PC2,PC3,Sales
0,1.154982,-0.733706,1.787831,22.1
1,0.807722,-1.463172,-0.511109,10.4
2,1.173178,-2.537629,-0.007977,12.0
3,1.197218,-1.055509,0.770251,16.5
4,-0.746110,-0.507496,1.300952,17.9
...,...,...,...,...
195,-1.553455,-0.135925,-1.234297,7.6
196,-1.337280,0.489510,-0.979147,14.0
197,-0.845392,1.168831,-0.408399,14.8
198,1.562766,-0.246617,2.081467,25.5


In [41]:
X_new = new_df.drop(columns="Sales", axis=1)
y_new = new_df['Sales']

In [42]:
X_new_train, X_new_test, y_new_train, y_new_test = train_test_split(X_new,y_new,test_size=0.2,
                                                                    random_state=42)

In [43]:
lr_model2 = LinearRegression()
lr_model2.fit(X_new_train,y_new_train)
y_new_predicted = lr_model2.predict(X_new_test)
r2_score(y_new_test, y_new_predicted)

0.9059011844150825

In [22]:
X = df.drop(columns='Sales', axis=1)
y = df['Sales']

In [23]:
X

Unnamed: 0,TV,Radio,Newspaper
0,0.969852,0.981522,1.778945
1,-1.197376,1.082808,0.669579
2,-1.516155,1.528463,1.783549
3,0.052050,1.217855,1.286405
4,0.394182,-0.841614,1.281802
...,...,...,...
195,-1.270941,-1.321031,-0.771217
196,-0.617035,-1.240003,-1.033598
197,0.349810,-0.942899,-1.111852
198,1.594565,1.265121,1.640850


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [25]:
# now lets use scikit learn's PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

# now apply linear regression
lr_model3 = LinearRegression()
lr_model3.fit(X_train_pca, y_train)
y_predicted_pca = lr_model3.predict(X_test_pca)
r2_score(y_test, y_predicted_pca)

0.6905814674328943

In [26]:
X_train

Unnamed: 0,TV,Radio,Newspaper
79,-0.362479,-1.050937,-0.343121
197,0.349810,-0.942899,-1.111852
38,-1.213724,0.232011,0.209261
24,-0.989528,-0.720071,-0.564074
122,0.898623,-1.408812,-0.688360
...,...,...,...
106,-1.425075,-0.828109,-0.039311
14,0.666253,0.650657,0.711007
92,0.825059,0.691171,1.309421
179,0.216694,-0.895633,-0.596296


In [27]:
model4 = LinearRegression()
model4.fit(X_train,y_train)
y_pre = model4.predict(X_test)
r2_score(y_test,y_pre)

0.9059011844150826