# 1 Base Line (mean prediction)
# 2. Data Cleaning & preparation
# 3. Linear Regression Model
# 4. Feature Engg.
# 5. Tunning
# 6. Cross Validation
# 7. Residual Analysis

In [19]:
import numpy as np
import pandas as  pd
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [56]:
data = {
    'StudyHours': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Attendance': [60, 65, 70, 72, 75, 80, 85, 88, 90],
    'PrevScore': [40, 45, 50, 55, 60, 65, 70, 75, 80],
    'FinalMarks': [50, 55, 60, 65, 70, 75, 78, 85, 88]
}
df=pd.DataFrame(data)
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [57]:
y=df['FinalMarks']
y

0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64

In [58]:
y_pred_baseline=np.repeat(y.mean(),len(y))


In [59]:
mae=mean_absolute_error(y,y_pred_baseline)
print(mae)

10.716049382716049


In [60]:
mse=mean_squared_error(y,y_pred_baseline)

In [61]:
print(mse)

154.02469135802468


In [62]:
rmse=np.sqrt(mse)

In [63]:
print(rmse)

12.410668449282847


In [64]:
r2 = r2_score(y, y_pred_baseline)

print(f"Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=10.72, RMSE=12.41, R-Squared=0.00


In [14]:
#clean the data 
print(df.isnull().sum())

StudyHours    0
Attendance    0
PrevScore     0
FinalMarks    0
dtype: int64


In [16]:
df=df.fillna(df.mean())

In [17]:
df

Unnamed: 0,StudyHours,Attendance,PrevScore,FinalMarks
0,2,60,40,50
1,3,65,45,55
2,4,70,50,60
3,5,72,55,65
4,6,75,60,70
5,7,80,65,75
6,8,85,70,78
7,9,88,75,85
8,10,90,80,88


In [86]:
# Data prep, find input , output
X=df[['StudyHours','Attendance','PrevScore']]
y=df['FinalMarks']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [87]:
#Model Creation
model=LinearRegression()

In [88]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [89]:
print('Original Final Marks')
print(y)
y_all_prediction=model.predict(X)
print('Predicted all Final Marks')
print(y_all_prediction)

Original Final Marks
0    50
1    55
2    60
3    65
4    70
5    75
6    78
7    85
8    88
Name: FinalMarks, dtype: int64
Predicted all Final Marks
[50.58653846 55.07692308 59.56730769 64.66346154 69.55769231 74.04807692
 78.53846154 83.43269231 88.52884615]


In [90]:
y_pred=model.predict(X_test)
print(y_pred)

[83.43269231 59.56730769]


In [91]:
mae=mean_absolute_error(y_test,y_pred)


In [92]:
mse=mean_squared_error(y_test,y_pred)


In [93]:
rmse=np.sqrt(mse)


In [94]:
r2=r2_score(y_test,y_pred)


In [95]:
print(f"Baseline MAE={mae:.2f}, RMSE={rmse:.2f}, R-Squared={r2:.2f}")

Baseline MAE=1.00, RMSE=1.15, R-Squared=0.99


# Base Line (mean prediction)
2. Data Cleaning & preparation
3. Linear Regression Model
4. Find out MAE, RMSE, R-squared
5. Data Set as follows
-------------------------------------------------------------
 data = {
    'Area': [850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
    'Bedrooms': [1, 2, 2, 2, 3, 3, 2, 4, 4],
    'Age': [1, 1, 3, 2, 1, 2, 2, 1, 2],
    'PriceIn100K': [5, 6, 7, 7, 70, 9, 9, 11,None]
}

In [114]:
 data = {
    'Area': [850, 900, 1000, 1100, 1200, 1500, 16000, 1800, 2000],
    'Bedrooms': [1, 2, 2, 2, 3, 3, 2, 4, 4],
    'Age': [1, 1, 3, 2, 1, 2, 2, 1, 2],
    'PriceIn100K': [5, 6, 7, 7, 70, 9, 9, 11,None]
}


In [115]:
df=pd.DataFrame(data)
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100K
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,70.0
5,1500,3,2,9.0
6,16000,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,


In [116]:
print(df.isnull().sum())

Area           0
Bedrooms       0
Age            0
PriceIn100K    1
dtype: int64


In [117]:
print(df.isna().sum())

Area           0
Bedrooms       0
Age            0
PriceIn100K    1
dtype: int64


In [118]:
df=df.fillna(df.mean())

In [119]:
df

Unnamed: 0,Area,Bedrooms,Age,PriceIn100K
0,850,1,1,5.0
1,900,2,1,6.0
2,1000,2,3,7.0
3,1100,2,2,7.0
4,1200,3,1,70.0
5,1500,3,2,9.0
6,16000,2,2,9.0
7,1800,4,1,11.0
8,2000,4,2,15.5


In [120]:
# Data prep, find input , output
X=df[['Area','Bedrooms','Age']]
y=df['PriceIn100K']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [121]:
model=LinearRegression()

In [122]:
model.fit(X_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [123]:
y_pred=model.predict(X_test)
print(y_pred)

[ 63.78691003 -42.84839412]


In [124]:
mae=mean_absolute_error(y_test,y_pred)
print(mae)

51.31765207152032


In [125]:
mse=mean_squared_error(y_test,y_pred)
print(mse)

2635.6601330716762


In [126]:
rmse=np.sqrt(mse)
print(rmse)

51.338680671319125


In [127]:
r2=r2_score(y_test,y_pred)

In [128]:
print(r2)

-657.9150332679191
