# Simple Linear Regression Model to predict student scores based on study hours

In [None]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
df = pd.read_csv('StudentStudyHour1.csv')
df.head(5)

Unnamed: 0,Hours,Scores
0,2.5,21.0
1,5.1,47.0
2,3.2,27.0
3,8.5,75.0
4,3.5,30.0


In [9]:
df.isnull().sum()

Hours     0
Scores    2
dtype: int64

In [10]:
# dropping rows which have null values
df.dropna(inplace=True,axis=0)
df.head(5)

Unnamed: 0,Hours,Scores
0,2.5,21.0
1,5.1,47.0
2,3.2,27.0
3,8.5,75.0
4,3.5,30.0


In [11]:
df.isnull().sum()

Hours     0
Scores    0
dtype: int64

In [13]:
df.shape #check the number of rows and columns

(26, 2)

In [17]:
#splits the dataset into dependent(y) and independent(x) variables
Y = df['Scores']
X = df.drop('Scores',axis=1)

In [20]:

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [22]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((19, 1), (7, 1), (19,), (7,))

In [17]:
print(x_train)  #just to check the values in test data

    Hours
26    2.6
24    7.8
25    1.3
22    3.8
11    5.9
21    4.8
2     3.2
7     5.5


In [18]:
print(y_test) #just to check the values in test data

26    30.0
24    86.0
25    19.0
22    35.0
11    62.0
21    54.0
2     27.0
7     60.0
Name: Scores, dtype: float64


In [23]:
# train the model using training data
lr = LinearRegression()
model = lr.fit(x_train,y_train)


In [24]:
#test the model using test data ( Pass only independent variable not dependent varible)
y_pred = model.predict(x_test)

# Just compare actual values and predicted values
df = pd.DataFrame({'y_test':y_test,'y_pred':y_pred})
df

Unnamed: 0,y_test,y_pred
26,30.0,28.743383
22,35.0,40.165954
0,21.0,27.791502
23,76.0,69.674263
12,41.0,46.82912
18,67.0,62.059215
21,54.0,49.684763


In [39]:
# Evaluate the model using MSE 
print("The MSE is : ", mean_squared_error(y_test,y_pred))
print("The R square is : ", r2_score(Y_test, y_test_predict))

The MSE is :  27.345269137410043
The R square is :  0.7333581857151947


# Multiple Linear Regression using Boston house price prediction 


This data was originally a part of UCI Machine Learning Repository and has been removed now. This data also ships with the scikit-learn library. 
There are 506 samples and 13 feature variables in this data-set. The objective is to predict the value of prices of the house using the given features.

The description of all the features is given below:

  **CRIM**: Per capita crime rate by town

  **ZN**: Proportion of residential land zoned for lots over 25,000 sq. ft

  **INDUS**: Proportion of non-retail business acres per town

  **CHAS**: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)

  **NOX**: Nitric oxide concentration (parts per 10 million)

  **RM**: Average number of rooms per dwelling

  **AGE**: Proportion of owner-occupied units built prior to 1940

  **DIS**: Weighted distances to five Boston employment centers

  **RAD**: Index of accessibility to radial highways

  **TAX**: Full-value property tax rate per $10,000

  **B**: 1000(Bk - 0.63)², where Bk is the proportion of [people of African American descent] by town

  **LSTAT**: Percentage of lower status of the population

  **MEDV**: Median value of owner-occupied homes in $1000s


In [27]:
df=pd.read_csv("BostonHousing.csv")
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,d,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.9,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  d        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [30]:
# check for missing values in all the columns
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
d          0
lstat      0
medv       0
dtype: int64

In [33]:
X = df.drop('medv',axis=1)
Y = df['medv'] #This "medv" is taken as target price to predict

In [34]:
from sklearn.model_selection import train_test_split

# splits the training and test data set in 80% : 20%
# assign random_state to any value.This ensures consistency.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(404, 13)
(102, 13)
(404,)
(102,)


In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

In [36]:
# model evaluation for training set

y_train_predict = lin_model.predict(X_train)
rmse = (np.sqrt(mean_squared_error(Y_train, y_train_predict)))
r2 = r2_score(Y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

The model performance for training set
--------------------------------------
RMSE is 4.747005747152538
R2 score is 0.7376761553975466




In [37]:
# model evaluation for testing set

y_test_predict = lin_model.predict(X_test)
# root mean square error of the model
rmse = (np.sqrt(mean_squared_error(Y_test, y_test_predict)))

# r-squared score of the model
r2 = r2_score(Y_test, y_test_predict)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

The model performance for testing set
--------------------------------------
RMSE is 4.56907202818064
R2 score is 0.7333581857151947
