# TP5. Regression - Ex.1

#### a)  Importar os dados - Advertising.csv 

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

- Target Variable: 
  - Sales
- Predictors:
  - TV - Budget of advertisements in TV
  - Radio - Budget of advertisements in radio
  - Newspaper - Budget of advertisements in newspaper

In [None]:
# Load only CSV columns 1-4 through `usecols` (used here instead of index_col=0)
df = pd.read_csv('Advertising.csv', usecols=[1,2,3,4])

#### b) Analisar os dados - alguns exemplos

In [None]:
# Column dtypes and non-null counts of the loaded frame
df.info()

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
# Summary statistics for the columns of the frame
df.describe()

### Exploratory Data Analysis

In [None]:
# Share of missing values per column, expressed as a percentage of all rows
df.isna().sum().mul(100).div(df.shape[0])

In [None]:
# Outliers: one boxplot per advertising channel, stacked in three rows
fig, axs = plt.subplots(3, figsize = (3,3))
# The series is passed through the explicit `x=` keyword. Passing it positionally
# relies on seaborn's first positional parameter, which changed from `x` to `data`
# between seaborn releases, so the box orientation would depend on the installed version.
plt1 = sns.boxplot(x=df['TV'], ax = axs[0])
plt2 = sns.boxplot(x=df['newspaper'], ax = axs[1])
plt3 = sns.boxplot(x=df['radio'], ax = axs[2])
# Avoid overlapping axis labels between the stacked subplots
plt.tight_layout()

In [None]:
# Scatterplots of each advertising budget (x axis) against sales (y axis)
sns.pairplot(
    data=df,
    x_vars=['TV', 'radio', 'newspaper'],
    y_vars='sales',
    height=2,
    aspect=1,
);


In [None]:
# Same predictor-vs-sales panels, drawn with kind='reg' so each panel also shows a fitted regression line
sns.pairplot(data=df, x_vars=['TV', 'radio', 'newspaper'], y_vars='sales', height=3, aspect=0.7, kind='reg');


In [None]:
# Optional (left disabled): pairwise plots among the three predictors only
#sns.pairplot(data=df[['TV','radio','newspaper']], height = 2, aspect = 1);

In [None]:
# Pairwise correlation matrix between all columns of the frame
df.corr()

In [None]:
# Heatmap of the correlation matrix; annot=True writes each coefficient inside its cell
sns.heatmap(df.corr(), annot = True)

### Construção do Modelo de Regressão

### c) Holdout

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

# Target is sales; the single predictor (TV budget) is kept as a 2-D array
y = df.sales
X = df[['TV']].to_numpy()

# Holdout: 70% of the rows for training, 30% for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)

# Report the shapes of the four resulting pieces
print("X_train size:", X_train.shape)
print("y_train size:", y_train.shape)
print("\nX_test size:", X_test.shape)
print("y_test size:", y_test.shape)

### d) Simple Linear Regression 

In [None]:
## Create the estimator and fit it on the training split in one step
regr = LinearRegression().fit(X_train, y_train)

## Print the fitted line: intercept plus slope times x
print("Eq. da reta: y=", regr.intercept_, "+", regr.coef_, "x")

In [None]:
# TV budget vs. sales over the whole frame, with a first-order fit and no confidence band
sns.regplot(
    data=df,
    x="TV",
    y="sales",
    order=1,
    ci=None,
    scatter_kws={'color': 'r', 's': 9},
)
# Fix the visible x range and start the y axis at zero
plt.xlim(-10, 310)
plt.ylim(bottom=0);

In [None]:
### Evaluate the model
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Predictions for the held-out test split
y_pred = regr.predict(X_test)

# Mean absolute error computed by hand from the test residuals
MAE = np.mean(abs(y_test - y_pred))
# The value is measured on the test split, so it is labelled as such
# (the label previously said "training set").
print("MAE on test set: {:.3f}".format(MAE))

# Equivalent computation with scikit-learn:
#print("MAE:", mean_absolute_error(y_test, y_pred))


### e) Multiple Linear Regression

In [None]:
# Feature matrix with the three budgets, in column order radio, TV, newspaper; target is sales
X = df[['radio', 'TV', 'newspaper']].to_numpy() 
y = df.sales

In [None]:
# Holdout split: 30% of the rows go to the test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [None]:
# Fit the multiple linear regression model on the training split
mlr = LinearRegression()  
mlr.fit(X_train, y_train)

In [None]:
# Intercept and per-feature coefficients of the fitted model
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
# Label each coefficient with the column order used to build X (radio, TV, newspaper).
# Zipping against `df` would iterate the frame's own column labels instead, which is
# not the order X was built with, so coefficients would be paired with the wrong names.
list(zip(['radio', 'TV', 'newspaper'], mlr.coef_))

Regression Equation: Sales = 2.6528 + (0.0454 * TV) + (0.1898 * Radio) + (0.0046 * Newspaper)

In [None]:
# Predict sales for the held-out test split
y_pred_mlr= mlr.predict(X_test)
# Uncomment to print every predicted value
#print("Prediction for test set: {}".format(y_pred_mlr))

In [None]:
# Side-by-side table of the actual test targets and the model's predictions
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
# Show only the first rows of the comparison
mlr_diff.head()

In [None]:
# Model evaluation: every metric below is measured on the held-out test split
from sklearn import metrics

meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)

# RMSE is the square root of the MSE already computed above
rootMeanSqErr = np.sqrt(meanSqErr)

# R² is scored on the test split only. Scoring on the full (X, y) would include the
# training rows and would not be comparable with the other test-set metrics.
print('R squared (test set, %): {:.2f}'.format(mlr.score(X_test, y_test)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

### f) Simplify the Model

In [None]:
# Simplified feature matrix: newspaper is dropped, column order is radio, TV; target is sales
X = df[['radio', 'TV']].to_numpy() 
y = df.sales

In [None]:
# Holdout split: 30% of the rows go to the test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [None]:
# Fit the two-feature linear regression model on the training split
mlr = LinearRegression()  
mlr.fit(X_train, y_train)

In [None]:
# Intercept and per-feature coefficients of the two-feature model
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
# Label each coefficient with the column order used to build X (radio, TV).
# Zipping against `df` would iterate the frame's own column labels instead, which is
# not the order X was built with, so coefficients would be paired with the wrong names.
list(zip(['radio', 'TV'], mlr.coef_))

In [None]:
# Predict sales for the held-out test split
y_pred_mlr= mlr.predict(X_test)
# Print every predicted value
print("Prediction for test set: {}".format(y_pred_mlr))

In [None]:
# Side-by-side table of the actual test targets and the model's predictions
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
# Show only the first rows of the comparison
mlr_diff.head()

In [None]:
# Model evaluation: every metric below is measured on the held-out test split
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
# RMSE is the square root of the MSE already computed above
rootMeanSqErr = np.sqrt(meanSqErr)
# R² is scored on the test split only. Scoring on the full (X, y) would include the
# training rows and would not be comparable with the other test-set metrics.
print('R squared (test set, %): {:.2f}'.format(mlr.score(X_test, y_test)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

In [None]:
# Create a coordinate grid covering the plotted radio and TV budget ranges
radio = np.arange(0,50)
TV = np.arange(0,300)

# With indexing='xy', B1 (radio) and B2 (TV) both have shape (TV.size, radio.size)
B1, B2 = np.meshgrid(radio, TV, indexing='xy')

# Evaluate the fitted plane on the whole grid at once. coef_[0] multiplies radio and
# coef_[1] multiplies TV, matching the column order the model was trained with.
# This vectorised expression replaces an element-by-element Python loop and yields
# the same (TV.size, radio.size) array.
Z = mlr.intercept_ + B1*mlr.coef_[0] + B2*mlr.coef_[1]

In [None]:
        
# 3-D figure: fitted regression surface with the observed points drawn on top
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(projection='3d')
fig.suptitle('Regression: sales ~ radio + TV', fontsize=12)

# Semi-transparent surface evaluated on the (B1, B2) grid
ax.plot_surface(B1, B2, Z, rstride=180, cstride=5, alpha=0.4)
# Observed data points in red
ax.scatter3D(df.radio, df.TV, df.sales, c='r')
# Alternative: colour the points by the target value
#ax.scatter3D(df.radio, df.TV, df.sales, c=y, s=10, cmap='viridis')

# Axis limits first, then the labels
ax.set_xlim(0, 50)
ax.set_ylim(bottom=0)
ax.set_xlabel('radio')
ax.set_ylabel('TV')
ax.set_zlabel('sales')

### g) Regression tree
See the [scikit-learn `DecisionTreeRegressor` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn-tree-decisiontreeregressor).

### i) MAE and RMSE

In [None]:
# Feature matrix with the three budgets, in column order radio, TV, newspaper; target is sales
X = df[['radio', 'TV', 'newspaper']].to_numpy() 
y = df.sales

In [None]:
# Holdout split: 30% of the rows go to the test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Unconstrained alternative, left disabled:
#tree = DecisionTreeRegressor(random_state=42)
# Regression tree with some parameters restricted (maximum depth and minimum samples per split)
tree = DecisionTreeRegressor(random_state=42, max_depth=6, min_samples_split=3)
# Fit on the training split; `model` holds whatever fit() returns
model = tree.fit(X_train, y_train)

# Predictions on the training split (y_pred) and on the test split (y_pred1)
y_pred = tree.predict(X_train)
y_pred1 = tree.predict(X_test)

In [None]:
# Test-set residuals, reused by the manual MAE and RMSE computations below
test_residuals = y_test - y_pred1

# MAE through scikit-learn on the training and test splits
MAE1 = metrics.mean_absolute_error(y_train, y_pred)
MAE2 = metrics.mean_absolute_error(y_test, y_pred1)
# MAE on the test split computed by hand, as a cross-check of MAE2
MAE3 = np.mean(abs(test_residuals))

print("MAE on training set: {:.3f}".format(MAE1))
print("MAE on test set: {:.3f}".format(MAE2))
print("MAE on test set: {:.3f}".format(MAE3))  # manual computation

# Root mean squared error on the test split, computed by hand
RMSE = np.sqrt(np.mean(test_residuals**2))
print("RMSE: {:.4f}".format(RMSE))


### h) Regression tree visualization

In [None]:
# Plot the fitted regression tree.
# plot_tree is imported by name: `from sklearn import tree` would rebind `tree`,
# which this notebook also uses for the estimator instance.
from sklearn.tree import plot_tree

# feature_names follows the column order the model was trained on (radio, TV, newspaper).
# list(df.columns) is not in that order and also contains the target column.
# class_names is omitted because it only applies to classifiers.
plot_tree(model,
          feature_names = ['radio', 'TV', 'newspaper'],
          filled=True,
          fontsize=6);

In [None]:
## Text representation of the fitted tree's decision rules
from sklearn.tree import export_text

# feature_names (in training column order) makes each rule name its predictor
# instead of a generic feature_<i> placeholder.
print(export_text(model,
                  feature_names=['radio', 'TV', 'newspaper'],
                  show_weights=True))