# Improving Regression
1. Base Line (mean prediction)
2. Data Cleaning & Preparation
3. Linear Regression Model
4. Feature Engg.
5. Tunning
6. Cross Validation
7. Residual Analysis

In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [41]:
data = {     
    'StudyHours': [2, 3, 4, 5, 6, 7, 8, 9, 10],     
    'Attendance': [60, 65, 70, 72, 75, 80, 85, 88, 90],     
    'PrevScore': [40, 45, 50, 55, 60, 65, 70, 75, 80],     
    'FinalMarks': [50, 55, 60, 65, 70, 75, 78, 85, 88] 
}
df=pd.DataFrame(data)
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [42]:
y=df['FinalMarks']
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [43]:
y_pred_baseline=np.repeat(y.mean(),len(y))

In [44]:
mae=mean_absolute_error(y,y_pred_baseline)
print(mae)

10.716049382716049


In [45]:
mse=mean_squared_error(y,y_pred_baseline)
print(mse)

154.02469135802468


In [46]:
rmse=np.sqrt(mse)

In [47]:
print(rmse)

12.410668449282847


In [48]:
r2 = r2_score(y, y_pred_baseline)
print(f'Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:2f}')

Baseline MAE=10.72, RMSE=12.41, R-Squared=0.000000


In [49]:
# clean the data
print(df.isnull().sum())

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [50]:
print(df.isna().sum())

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [51]:
df=df.fillna(df.mean())

In [52]:
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [63]:
# Data prep, find input, output
x=df[['StudyHours','Attendance','PrevScore']]
y=df['FinalMarks']
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

In [64]:
# Model Creation
model=LinearRegression()

In [65]:
model.fit(x_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [66]:
print('Original Final Marks')
print(y)
y_all_prediction=model.predict(x)
print('Predicted all FinalMarks')
print(y_all_prediction)

Original Final Marks
0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64
Predicted all FinalMarks
[50.58653846 55.07692308 59.56730769 64.66346154 69.55769231 74.04807692
 78.53846154 83.43269231 88.52884615]


In [67]:
y_pred=model.predict(x_test)
print(y_pred)

[83.43269231 59.56730769]


In [68]:
mae=mean_absolute_error(y_test,y_pred)

In [69]:
mse=mean_squared_error(y_test,y_pred)

In [70]:
rmse=np.sqrt(mse)

In [60]:
r2=r2_score(y_test,y_pred)

In [61]:
print(f'Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}')

Baseline MAE=1.00, RMSE=1.19, R-Squared=0.99


# Base Line (mean prediction)
2. Data Cleaning & Preparation
3. Linear Regression Model
4. Find out MAE, RMSE, R-Squared
5. Data Set as follows
-------------------------------------------------------------
   data = {
       'Area': [850, 900, 1000, 1100,1200, 1500, 16000, 1800, 2000]
       'Bedroom':[1,2,2,2,3,3,2,4,4],
       'Age': [1,1,3,2,1,2,2,1,2],
       'PriceIn100K':[5,6,7,7,70,9,9,11,None]
    }

In [71]:
data = {
    'Area': [850, 900, 1000, 1100,1200, 1500, 16000, 1800, 2000],
    'Bedroom':[1,2,2,2,3,3,2,4,4],
    'Age': [1,1,3,2,1,2,2,1,2],
    'PriceIn100K':[5,6,7,7,70,9,9,11,15]
}

In [72]:
df=pd.DataFrame(data)
df

Unnamed: 0,Area,Bedroom,Age,PriceIn100K
0,850,1,1,5
1,900,2,1,6
2,1000,2,3,7
3,1100,2,2,7
4,1200,3,1,70
5,1500,3,2,9
6,16000,2,2,9
7,1800,4,1,11
8,2000,4,2,15


In [73]:
print(df.isnull().sum())

Area           0
Bedroom        0
Age            0
PriceIn100K    0
dtype: int64


In [74]:
print(df.isna().sum())

Area           0
Bedroom        0
Age            0
PriceIn100K    0
dtype: int64


In [75]:
df=df.fillna(df.mean())

In [76]:
df

Unnamed: 0,Area,Bedroom,Age,PriceIn100K
0,850,1,1,5
1,900,2,1,6
2,1000,2,3,7
3,1100,2,2,7
4,1200,3,1,70
5,1500,3,2,9
6,16000,2,2,9
7,1800,4,1,11
8,2000,4,2,15


In [78]:
# Data prep, find input, output
x=df[['Area','Bedroom','Age']]
y=df['PriceIn100K']
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2, random_state=0)

In [79]:
model=LinearRegression()

In [80]:
model.fit(x,y)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [81]:
y_pred=model.predict(x_test)
print(y_pred)

[29.56470192  0.42295623]


In [82]:
mae=mean_absolute_error(y_test,y_pred)

In [83]:
mse=mean_squared_error(y_test,y_pred)

In [84]:
rmse=np.sqrt(mse)

In [85]:
r2=r2_score(y_test,y_pred)

In [86]:
print(f'Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}')

Baseline MAE=12.57, RMSE=13.93, R-Squared=-47.49
