In [2]:
# Shared imports: standard library first, then third-party packages
import warnings

import numpy as np
import pandas as pd

# Notebook-wide settings: never truncate columns when a DataFrame is displayed,
# and suppress every warning for the rest of the session.
pd.set_option('display.max_columns', None)
warnings.filterwarnings(action='ignore')

Put the data set in the same directory as your Jupyter notebook, then import it.

In [3]:
# Name of the CSV file holding the pie examples
fileName = "JohnnyPiesData.csv"

# Load the CSV into a DataFrame
pie_df = pd.read_csv(filepath_or_buffer=fileName)

# Preview the first rows to confirm that the load worked
pie_df.head(n=5)

Unnamed: 0,Example,Crust Shape,Crust Size,Crust Shade,Filling Size,Filling Shade,Class
0,ex1,Circle,Thick,Gray,Thick,Dark,pos
1,ex2,Circle,Thick,White,Thick,Dark,pos
2,ex3,Triangle,Thick,Dark,Thick,Gray,pos
3,ex4,Circle,Thin,White,Thin,Dark,pos
4,ex5,Square,Thick,Dark,Thin,White,pos


## Prepare the data for linear regression

In [4]:
# Remove the 'Example' column from the dataset.
# drop() returns a new frame that is bound back to pie_df instead of mutating
# it in place, and errors='ignore' makes this cell safe to re-run: the earlier
# inplace drop raised KeyError on a second run because the column was already gone.
pie_df = pie_df.drop(columns=['Example'], errors='ignore')

# Preview the result: the 'Example' column is no longer present
pie_df.head()

Unnamed: 0,Crust Shape,Crust Size,Crust Shade,Filling Size,Filling Shade,Class
0,Circle,Thick,Gray,Thick,Dark,pos
1,Circle,Thick,White,Thick,Dark,pos
2,Triangle,Thick,Dark,Thick,Gray,pos
3,Circle,Thin,White,Thin,Dark,pos
4,Square,Thick,Dark,Thin,White,pos


#### One Hot Encoding using get_dummies()

In [5]:
# One-hot encode every categorical column. drop_first=True drops the first
# level of each column, so a column with k levels yields k-1 indicator columns.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version (pandas 2.x would otherwise produce True/False booleans).
pie_df_encoded = pd.get_dummies(pie_df, drop_first=True, dtype=np.uint8)

# Display the encoded frame
pie_df_encoded

Unnamed: 0,Crust Shape_Square,Crust Shape_Triangle,Crust Size_Thin,Crust Shade_Gray,Crust Shade_White,Filling Size_Thin,Filling Shade_Gray,Filling Shade_White,Class_pos
0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,1,0,0,0,1
2,0,1,0,0,0,0,1,0,1
3,0,0,1,0,1,1,0,0,1
4,1,0,0,0,0,1,0,1,1
5,0,0,0,0,1,1,0,0,1
6,0,0,0,1,0,0,0,1,0
7,1,0,0,0,1,0,1,0,0
8,0,1,1,1,0,1,0,0,0
9,0,0,0,0,0,0,0,1,0


#### Extract the features 

In [6]:
# Every column except the last one is a feature
features = pie_df_encoded.iloc[:, :-1]

# Show the container type, then the type of one individual cell (row 2, column 3)
print(type(features))

print(type(features.iat[2, 3]))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.uint8'>


#### Extract the class labels as response

In [7]:
# The last column of the encoded frame holds the class label
response = pie_df_encoded.iloc[:, -1]

# Selecting a single column gives a Series
print(type(response))

# Turn the Series into a one-column DataFrame
response = response.to_frame()

# Confirm the new type
print(type(response))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


## Perform Linear Regression model fitting

In [8]:
# Bring in the LinearRegression estimator
from sklearn.linear_model import LinearRegression

# Create an estimator with its default settings
reg_model = LinearRegression()

# Fit on the encoded features, using the class column as the target
reg_model.fit(X=features, y=response)

LinearRegression()

## Examine Linear Regression Model Parameters

In [9]:
# Fitted coefficients of the regression model
print(f"Coefficients of the Linear Regression model: \n {reg_model.coef_}")

# Fitted intercept of the regression model
print(f"Intercept of the Linear Regression model: \n {reg_model.intercept_}")


Coefficients of the Linear Regression model: 
 [[-0.52586207 -0.83189655 -0.56465517 -0.63793103 -0.92672414  0.70258621
   0.12068966 -1.07327586]]
Intercept of the Linear Regression model: 
 [1.56034483]


## Making Predictions using the Linear Regression Model

In [10]:
# Run the fitted model on the same features it was trained on
preds = reg_model.predict(X=features)

# Show what kind of object predict() returned
print(type(preds))

<class 'numpy.ndarray'>


#### Response Comparison

In [11]:
# resp_comp (response comparison): the actual labels side by side with the
# model outputs.

# Raw regression outputs for every row, computed in one vectorized call.
# The previous per-row loop rebuilt each row with np.reshape (discarding the
# column names the model was fitted with) and called float() on a 2-D array,
# which NumPy 1.25+ deprecates. ravel() flattens the result to one value per
# row whether predict() returns shape (n,) or (n, 1).
reg_outputs = reg_model.predict(features).ravel()

# Turn each output into a class label: 1 when it exceeds 0.5, otherwise 0
predicted_resp = (reg_outputs > 0.5).astype(int)

# assign() returns a new frame, so response itself is left untouched
resp_comp = response.assign(
    Regression_Predictions=reg_outputs,
    Predicted_Responses=predicted_resp,
)

resp_comp

Unnamed: 0,Class_pos,Regression_Predictions,Predicted_Responses
0,1,0.922414,1
1,1,0.633621,1
2,1,0.849138,1
3,1,0.771552,1
4,1,0.663793,1
5,1,1.336207,1
6,0,-0.150862,0
7,0,0.228448,0
8,0,0.228448,0
9,0,0.487069,0


## Calculate model accuracy

In [12]:
# Bring in accuracy_score from sklearn
from sklearn.metrics import accuracy_score

# accuracy_score compares predicted labels with true labels:
#     acc_score = accuracy_score(y_pred = preds, y_true= response)
# For that comparison both arguments have to hold class labels in a comparable
# format, so print them and inspect what each one currently contains.
print("Predictions:\n", preds)

print("Actual labels: \n", response)

Predictions:
 [[ 0.92241379]
 [ 0.63362069]
 [ 0.84913793]
 [ 0.77155172]
 [ 0.6637931 ]
 [ 1.3362069 ]
 [-0.15086207]
 [ 0.22844828]
 [ 0.22844828]
 [ 0.48706897]
 [ 0.10775862]
 [-0.07758621]]
Actual labels: 
     Class_pos
0           1
1           1
2           1
3           1
4           1
5           1
6           0
7           0
8           0
9           0
10          0
11          0


In [13]:
# preds currently holds the raw numerical regression outputs, but they must be
# class labels before they can be compared with the actual class labels.
# Rule: output > 0.5 -> class 1, otherwise class 0.
# The comparison is vectorized; np.ravel first flattens the (n, 1) column of
# outputs to 1-D, so the result is a flat array of 0/1 labels without relying
# on the truth value of one-element arrays as the earlier row-by-row loop did.
preds = (np.ravel(preds) > 0.5).astype(int)

In [14]:
# Fraction of rows whose predicted label equals the actual label.
# Note: preds were produced from the same features the model was fitted on.
acc_score = accuracy_score(y_true=response, y_pred=preds)

print(f"Accuracy Score of the model is: {acc_score} ")

Accuracy Score of the model is: 1.0 
