In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook-wide plotting defaults: figure size and a larger, bold font.
plt.rcParams['figure.figsize'] = [8,5]
plt.rcParams['font.size'] = 14
plt.rcParams['font.weight'] = 'bold'

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later releases no longer accept the old names. Use the new name when this
# Matplotlib ships it and fall back to the legacy name on older installs.
whitegrid_style = (
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
plt.style.use(whitegrid_style)



In [None]:
# Read the advertising CSV from the Kaggle input directory into a DataFrame,
# then preview its first rows.
csv_path = '/kaggle/input/advertising-dataset/advertising.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show pandas' summary statistics for the loaded DataFrame.
data.describe()

In [None]:
# Pairwise plots: every numerical attribute is plotted against each of the others.
pair_grid = sns.pairplot(data)
plt.show()

In [None]:
# Visual check for missing values: draw the boolean isnull() mask as a heatmap,
# so any missing cell would stand out in a different colour.
# (The original author notes that this dataset shows none.)
fig = plt.figure(figsize=(12, 4))
ax = sns.heatmap(data.isnull(), cmap='viridis', cbar=False, yticklabels=False)
ax.set_title('Missing value in the dataset');

In [None]:
# Check whether some attributes correlate with others.
# data.corr() gives the pairwise correlation coefficient of the columns; the
# annotated heatmap prints each coefficient inside its cell, which makes it easy
# to see which attribute correlates most strongly with another.
corr = data.corr()
sns.heatmap(data=corr, annot=True, cmap='Wistia');

In [None]:
# Side-by-side histograms of the target column: raw values on the left,
# log-scaled values on the right. The titles name the plotted column (Sales).
f = plt.figure(figsize=(12,4))

ax_raw = f.add_subplot(121)
sns.histplot(data['Sales'], bins=50, color='r', ax=ax_raw)
ax_raw.set_title('Distribution of Sales')

ax_log = f.add_subplot(122)
# log_scale=True already puts the x-axis on a log scale, so no separate
# set_xscale('log') call is needed afterwards.
sns.histplot(data['Sales'], bins=50, color='b', log_scale=True, ax=ax_log)
ax_log.set_title('Distribution of Sales in $log$ scale');

In [None]:
# Replace Sales with its natural logarithm and plot the transformed distribution.
# NOTE: this line overwrites the column it reads from, so running the cell twice
# applies the log twice. Re-run the data-loading cell before re-running this one.
data['Sales'] = np.log(data['Sales'])

f = plt.figure(figsize=(12,4))
ax = f.add_subplot(121)
sns.histplot(data['Sales'], bins=50, color='r', ax=ax)
# The title names the plotted column (Sales); the trailing ';' hides the Text repr.
ax.set_title('Distribution of Sales after log transf');


In [None]:
# One-hot encode the listed columns and compare the frame before and after.
# NOTE(review): TV, Radio and Newspaper are passed to get_dummies as categorical
# columns, which creates one indicator column per distinct value. If these hold
# continuous amounts (earlier cells treat them as numeric), the encoding discards
# their ordering and magnitude. Confirm this is intended.
categorical_columns = ['TV', 'Radio', 'Newspaper']
data_encoded = pd.get_dummies(data = data,
                             columns = categorical_columns,
                             dtype ='int8')
print('columns in original data frame:\n',data.columns.values)
# Report the shape here (the label promises rows and columns), not the column names again.
print('\nNumber of rows and columns in the dataset:', data.shape)
print('\nColumns in data frame after encoding dummy variable:\n', data_encoded.columns.values)
print('\nNumber of rows and columns in the dataset:', data_encoded.shape)
data_encoded.head()


In [None]:
from sklearn.model_selection import train_test_split
# Separate the target (Sales) from the features, then hold out 30% of the rows
# for testing. The fixed random_state keeps the split the same on every run.
y = data_encoded['Sales']
x = data_encoded.drop(columns='Sales')
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=23
)

In [None]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split (fit() returns the estimator).
lin_reg = LinearRegression().fit(x_train, y_train)
# For a regressor, score() is the coefficient of determination R^2 on the given data.
R_square_sk = lin_reg.score(x_test, y_test)
print('R square error is:', R_square_sk)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score
def cross_val(classifier):
    """Return the mean 10-fold cross-validation score of an estimator.

    The estimator is evaluated on the notebook-level `x` (features) and `y`
    (target). No `scoring` argument is given, so `cross_val_score` falls back
    to the estimator's own default scorer.

    Parameters
    ----------
    classifier : estimator
        Any estimator accepted by `cross_val_score`; despite the parameter
        name, this notebook passes a regressor.

    Returns
    -------
    The mean of the per-fold scores.
    """
    return cross_val_score(classifier, x, y, cv=10).mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned with `true`.

    Returns
    -------
    None. The four metrics are written to stdout, one per line.
    """
    mse = metrics.mean_squared_error(true, predicted)
    report = (
        ('Mean Absolute Error:', metrics.mean_absolute_error(true, predicted)),
        ('Mean Squared Error:', mse),
        # RMSE is the square root of the MSE computed above.
        ('Root Mean Squared Error:', np.sqrt(mse)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    )
    for label, value in report:
        print(label, value)
    
    
# Mean cross-validation score of the fitted linear model over the full x / y.
cv_mean = cross_val(lin_reg)
print('Cross Validation mean:', cv_mean)

# Predictions for both splits, so held-out and training metrics can be compared.
train_pred = lin_reg.predict(x_train)
test_pred = lin_reg.predict(x_test)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('====================================')
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)
                                          

In [None]:
# Tabulate the fitted linear model: the intercept first, then one coefficient
# per feature column of x.
columns = ['intersect:x_0=1', *x.columns.values]
theta = [lin_reg.intercept_, *lin_reg.coef_]
parameters = pd.DataFrame({'Columns': columns, 'theta': theta})
parameters

In [None]:
# Linearity check: scatter the held-out targets against the model's predictions.
f = plt.figure(figsize =(14,5))
ax = f.add_subplot(121)
sns.scatterplot(x=y_test, y=test_pred, ax=ax, color ='r')
# Label both axes explicitly so the figure can be read on its own.
ax.set_xlabel('Actual value (y_test)')
ax.set_ylabel('Predicted value (test_pred)')
# The trailing ';' keeps the Text repr out of the cell output.
ax.set_title('Check for linearity:\n Actual Vs Predicted value');

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Shallow regression tree (depth 2) fitted on the training split.
# random_state is fixed so the fitted tree is the same on every run; without it,
# ties between equally good splits can be broken differently between runs.
# The seed value matches the one used for the train/test split.
dtr = DecisionTreeRegressor(max_depth=2, random_state=23)
# fit() returns the estimator itself, so `model` and `dtr` are the same object.
model = dtr.fit(x_train, y_train)

print('Score on the train data:', dtr.score(x_train, y_train))
print('Score on the test data:', dtr.score(x_test, y_test))


In [None]:
from sklearn import tree
import matplotlib.pyplot as plt
# Draw the fitted tree on a large canvas.
plt.figure(figsize=(40,20))
# feature_names is passed as a plain list because some scikit-learn releases
# validate this argument and reject a pandas Index.
# label='root' shows the informative captions on the root node only.
# The trailing ';' keeps the returned list of annotations out of the cell output.
tree.plot_tree(model, feature_names=list(x.columns), label='root');

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the fitted tree's feature importances, one bar per
# feature column of x; each bar is given a height of 1.
plt.figure()
plt.title('Feature importances')
plt.barh(y=x.columns, width=model.feature_importances_, height=1)
