In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Data acquisition
# Read the advertising dataset from a remote CSV (needs network access);
# index_col=0 makes the first CSV column the DataFrame index.
data = pd.read_csv('https://raw.githubusercontent.com/insaid2018/Term-2/master/CaseStudy/Advertising.csv', index_col=0) 
print('Data Shape:', data.shape)
# NOTE(review): a bare expression such as data.head() is only rendered by
# Jupyter when it is the last statement of a cell; here it is followed by more
# code, so wrap it in display(...) if the preview should actually appear.
data.head()

# Data description and pre-processing
data.describe()

# NOTE(review): newer releases of this package are published as
# `ydata_profiling` — confirm which one is installed in the target environment.
import pandas_profiling as pp

# Build an automated profiling report over the full DataFrame
profile = pp.ProfileReport(data)

profile

# Column dtypes and non-null counts
data.info()

# Exploratory data analysis
# plt.style.use changes global matplotlib settings, so selecting the style once
# is enough for every figure in this section.
plt.style.use("classic")

# Joint distribution (scatter + marginal histograms) of each advertising
# channel against sales. x and y are passed by keyword: seaborn deprecated
# positional data arguments in 0.11 and rejects them from 0.12 onwards.
for channel in ('newspaper', 'TV', 'radio'):
    sns.jointplot(x=channel, y='sales', data=data)
    plt.show()

# TV vs sales again, this time with a fitted regression line
sns.jointplot(x='TV', y='sales', data=data, kind="reg")
plt.show()

# Pairwise relationships between all columns. `height` is the current name of
# the facet-size keyword (it was called `size` before seaborn 0.9).
sns.pairplot(data, height=2, aspect=1.5)
plt.show()

# Correlation matrix of all columns, annotated with the coefficient values
sns.heatmap(data.corr(), annot=True)
plt.show()

'''
1. Strongest relationship is between TV and sales, with a correlation of 0.78
2. Weaker relationship between radio and sales, with a correlation of 0.58
3. Very weak or no relationship between newspaper and sales
4. Correlation ranges from -1 to +1: +1 means a perfect positive relationship, -1 means a perfect negative relationship, and 0 means no linear relationship
5. In this data, TV has the highest correlation with sales (0.78) of all the independent variables, hence TV is the most probable candidate for model building
'''

# Predictors are the three advertising channels; the target is sales.
feature_cols = ['TV', 'newspaper', 'radio']
X = data[feature_cols]
X.head()

y = data['sales']

y

# Hold out 40% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=0,
)

# Report the shape of every partition
for label, part in (
    ('X Train share:', X_train),
    ('X test shape:', X_test),
    ('y train shape:', y_train),
    ('y test shape:', y_test),
):
    print(label, part.shape)

# Standardisation
# Variables measured on different scales produce coefficients that are not
# directly comparable, so every feature is rescaled to zero mean and unit
# variance before model fitting. The scaler is fitted on the training split
# only and then applied to the test split, so no test-set statistics are used
# during training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Work on explicit copies: assigning columns onto the frames returned by
# train_test_split is chained assignment on a slice of X and raises
# SettingWithCopyWarning (silenced by the global warnings filter, so it would
# otherwise go unnoticed).
X_train = X_train.copy()
X_test = X_test.copy()
X_train[feature_cols] = sc.fit_transform(X_train[feature_cols])
X_test[feature_cols] = sc.transform(X_test[feature_cols])

X_train.describe()

X_test.describe()



# Model development and evaluation
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

lr

# Keep the fitted parameters under their own names; binding them to `y`
# would overwrite the target Series that the rest of the notebook relies on.
coefficients = lr.coef_
intercept = lr.intercept_

coefficients

intercept

feature_cols

# Build the label list as a new object instead of inserting into feature_cols:
# mutating it in place would add another 'Intercept' entry on every re-run and
# would break any later data[feature_cols] lookup.
coef_labels = ['Intercept'] + feature_cols

coef_labels

# Intercept first, followed by one coefficient per feature (same order as
# coef_labels)
coef = [lr.intercept_] + lr.coef_.tolist()

coef

# Print each term of the fitted equation as "<name> <value>"
for label, value in zip(coef_labels, coef):
    print(label, value)



# Using the model for predictions
y_pred_train = lr.predict(X_train)

y_pred_test = lr.predict(X_test)

y_pred_train

y_pred_test

# Actual vs predicted sales on the test split, with a fitted regression line.
# x and y are passed by keyword because newer seaborn releases no longer
# accept the data vectors positionally.
plt.style.use("classic")
sns.regplot(x=y_test, y=y_pred_test)
plt.show()

# Model evaluation using metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

# Mean absolute error: training split first, then test split
print(mean_absolute_error(y_train, y_pred_train))
print(mean_absolute_error(y_test, y_pred_test))

MSE_train = mean_squared_error(y_train, y_pred_train)
MSE_test = mean_squared_error(y_test, y_pred_test)

# Root mean squared error is the square root of the MSE values above
RMSE_train = np.sqrt(MSE_train)
RMSE_test = np.sqrt(MSE_test)
print(RMSE_train, RMSE_test)

# Model evaluation using r squared and adjusted r squared
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
print(r2_train, r2_test)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of samples and p the number of features.
n_train = len(y_train)
n_test = len(y_test)
p_train = X_train.shape[1]
p_test = X_test.shape[1]
adj_r_squared_train = 1 - (1 - r2_train) * (n_train - 1) / (n_train - p_train - 1)
adj_r_squared_test = 1 - (1 - r2_test) * (n_test - 1) / (n_test - p_test - 1)

print("Adjusted R2 score for training set is {}".format(adj_r_squared_train))
print("Adjusted R2 score for test set is {}".format(adj_r_squared_test))

'''
Observation :

1) R squared and adjusted R squared values for the training set are observed to be 0.9005511298841222 and 0.8979791763466426
2) R squared and adjusted R squared values for the test set are observed to be 0.8876696235952205 and 0.8832355297897687
'''


# Feature selection
# Refit the model without 'newspaper' (the channel with the weakest correlation
# to sales) to see whether the simpler model performs as well.
feature_cols = ['TV', 'radio']
X = data[feature_cols]
y = data.sales

X.columns

# Use the same test_size and random_state as the three-feature model so that
# both models are trained and scored on the same rows; with a different seed
# the metrics would reflect a different split, not the dropped feature.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.40, random_state=0)

X_train

# LinearRegression is already imported where the first model is built.
# The features are left unscaled here: for ordinary least squares, predictions
# and R2 do not depend on feature scaling, so the metrics stay comparable.
lr2 = LinearRegression()
lr2.fit(X_train,y_train)

y_pred_train = lr2.predict(X_train)
y_pred_test = lr2.predict(X_test)

y_pred_train

y_pred_test

# Mean squared error: training split, then test split
print(mean_squared_error(y_train,y_pred_train))

print(mean_squared_error(y_test, y_pred_test))

# Root mean squared error: training split, then test split
print(np.sqrt(mean_squared_error(y_train,y_pred_train)))
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))

r2_train = r2_score(y_train,y_pred_train)
r2_test = r2_score(y_test,y_pred_test)
print("R2 score for training is :", r2_train)
print("R2 score for test is :", r2_test)

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of samples and p the number of features.
adj_r2_train = 1 - (1-r2_train)*(len(y_train)-1)/(len(y_train)-X_train.shape[1]-1)
adj_r2_test = 1 - (1-r2_test)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1)
print("Adjusted R2 score for training dataset is :", adj_r2_train)
print("adjusted R2 score for test dataset is :", adj_r2_test)

# Placeholder for the written observations on the TV + radio model.
# The literal must be closed with three quotes: an unterminated triple-quoted
# string is a SyntaxError, which prevents the entire cell from executing.
'''Write the observations'''