In [58]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [59]:
# Record the packages installed in the kernel's environment (provenance for this analysis).
# NOTE(review): bare `pip` only works through IPython's automagic; `%pip list` is the explicit form.
pip list

Package                       Version
----------------------------- ------------
aiohttp                       3.8.1
aiosignal                     1.2.0
altair                        4.2.0
anyio                         3.5.0
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.0.5
async-timeout                 4.0.2
attrs                         21.4.0
Babel                         2.9.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.10.0
black                         21.12b0
bleach                        4.1.0
Bottleneck                    1.3.2
bqplot                        0.12.32
branca                        0.4.2
brotlipy                      0.7.0
cachetools                    4.2.4
Cartopy                       0.18.0
certifi                       2021.10.8
cffi                          1.15.0
charset-normalizer            2.0.10
click                         7.1.2
clic

In [60]:
# This code is retrieved from https://learnpython.com/blog/regression-analysis-in-python/
# It supports Zinuo Chen's dissertation: finding which factors from his questionnaire data
# have an impact on the overall score.
# Default figure size (width, height) in inches. The value is assigned through rcParams
# so that matplotlib validates it, instead of mutating the list that rcParams returns.
fig_size = [6, 4]
plt.rcParams['figure.figsize'] = fig_size
plt.rcParams['figure.figsize']

[6, 4]

In [61]:
# Location of the questionnaire data. Set the SB_DATA_PATH environment variable to point
# at the CSV on another machine; otherwise the original author's local path is used.
sb_data_path = os.environ.get('SB_DATA_PATH', r'C:\Users\sgao5\Downloads\sbdata.csv')
sb_data = pd.read_csv(sb_data_path)
sb_data.shape

(30, 7)

In [62]:
sb_data.head(n=5)  # Preview the first five rows of the data

Unnamed: 0,Obstruct,Environment,Unsanitary,Cheaper,Vitality,Safety,ToScore
0,0.0,0.4,0.0,0.8,1.0,0.2,0.6
1,0.2,0.4,0.0,0.6,1.0,0.4,0.6
2,0.4,0.4,0.2,0.4,0.8,0.2,0.8
3,0.4,0.4,0.4,0.4,0.4,0.4,0.8
4,0.2,0.2,0.2,0.2,0.8,0.4,0.6


In [63]:
sb_data.describe() # Summary statistics per column: count, mean, std, min, quartiles and max

Unnamed: 0,Obstruct,Environment,Unsanitary,Cheaper,Vitality,Safety,ToScore
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,0.206667,0.333333,0.22,0.466667,0.72,0.4,0.62
std,0.161743,0.191785,0.191905,0.176817,0.178885,0.174198,0.198963
min,0.0,0.0,0.0,0.2,0.4,0.0,0.2
25%,0.05,0.2,0.0,0.4,0.6,0.25,0.4
50%,0.2,0.4,0.2,0.4,0.8,0.4,0.6
75%,0.35,0.4,0.4,0.6,0.8,0.55,0.8
max,0.6,0.8,0.6,0.8,1.0,0.8,1.0


In [47]:
# NOTE(review): the DataFrame `.iplot()` calls used for the scatter plots in this notebook
# appear to be supplied by cufflinks — confirm against the cufflinks docs.
import cufflinks as cf
# NOTE(review): none of these four plotly names is referenced in the cells visible here — confirm before pruning.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# NOTE(review): presumably switches cufflinks to offline plotting so figures render inside the notebook — confirm.
cf.go_offline()


In [48]:
# Predictors are the six questionnaire factors; the target is the ToScore column.
# Do not use 'overall' as a parameter here: it causes an error.
feature_columns = ['Obstruct', 'Environment', 'Unsanitary', 'Cheaper', 'Vitality', 'Safety']
target_column = 'ToScore'
x = sb_data[feature_columns]
y = sb_data[target_column]

In [49]:
# Hold out part of the data so the regression model can be evaluated on observations
# it did not see while being fitted.
# Split: 30% test / 70% train, with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [51]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() hands back the estimator itself,
# so the chained call leaves `regrs` bound to the fitted model; the bare name displays it.
regrs = LinearRegression().fit(x_train, y_train)
regrs

LinearRegression()

In [52]:
# One row per predictor, labelled by column name, holding its fitted regression coefficient.
attributes_coefficients = pd.DataFrame(
    data=regrs.coef_,
    index=x.columns,
    columns=['Coefficient'],
)
attributes_coefficients

Unnamed: 0,Coefficient
Obstruct,-0.001712
Environment,0.198766
Unsanitary,0.648557
Cheaper,0.335928
Vitality,0.358395
Safety,0.150285


In [53]:
# Tabulate the actual test-set scores next to the model's predictions for the same rows.
y_predict = regrs.predict(x_test)
actual_vs_predicted = {'Actual': y_test, 'Predicted': y_predict}
comparison = pd.DataFrame(actual_vs_predicted)
comparison

Unnamed: 0,Actual,Predicted
27,0.6,0.77238
15,0.6,0.539206
23,0.4,0.599308
17,0.6,0.504147
8,0.8,0.441621
9,0.4,0.43747
28,0.8,0.611086
24,0.2,0.269732
12,0.6,0.360588


In [55]:
# Evaluate the linear regression model on the held-out test set.
# Background: https://medium.com/analytics-vidhya/mae-mse-rmse-coefficient-of-determination-adjusted-r-squared-which-metric-is-better-cd0326a5697e
# - MAE  (mean absolute error): average absolute difference between actual and predicted values.
# - MSE  (mean squared error): average squared difference between actual and predicted values.
# - RMSE (root mean squared error): square root of the MSE.
# Lower MAE, MSE and RMSE values imply a more accurate regression model.
# Rule of thumb from the original notes: RMSE should be less than 10% of the mean of the predicted output.
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, y_predict)
mse = metrics.mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time

print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)

MAE: 0.15802695580077983
MSE: 0.034447892670031025
RMSE: 0.18560143498914825


In [65]:
# Scatter plot of Obstruct against ToScore.
# The x-axis is left linear: Obstruct contains zeros, which a log-scaled axis cannot display.
sb_data.iplot(
    kind='scatter',
    x='Obstruct',
    y='ToScore',
    mode='markers',
    color='#0C090A',
    layout={
        'title': 'Obstruct vs ToScore',
        'xaxis': {'title': 'Obstruct'},
        'yaxis': {'title': "ToScore"},
    },
)

In [66]:
# Plot the graph Env vs ToScore
sb_data.iplot(kind='scatter', x='Environment', y='ToScore', mode='markers', color = '#0C090A',  layout = {
        'title' :'Environment vs ToScore',
        'xaxis': {'title': 'Environment', 'type': 'log'},
        'yaxis': {'title': "ToScore"}
    })