# Python Programming Assignment: Linear Regression Exercise

In [74]:
# let's import the neccesary libraries
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [213]:
# let's read the dataset using pandas
df = pd.read_csv('assignment1.csv')

# let's preview the dataset
df.head()

Unnamed: 0,Month,Financial Year,Chain,Suburb,State,Postcode,Country,Manager,Category,Buyer,Sales
0,1/1/2016,2015/16,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Home,Richard Carr,479.02
1,1/1/2016,2015/16,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,149.95
2,1/1/2016,2015/16,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Mens,Bruce Curran,14.0
3,1/1/2016,2015/16,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,12.0
4,1/1/2016,2015/16,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Womens,Amy Buchanan,46.94


In [214]:
# let's explore the dataset shape in a statement
print("The dataset has {} rows and {} columns".format(df.shape[0], df.shape[1]))

The dataset has 72425 rows and 11 columns


In [215]:
# let's preview the tail of the dataset
df.tail()

Unnamed: 0,Month,Financial Year,Chain,Suburb,State,Postcode,Country,Manager,Category,Buyer,Sales
72420,1/7/2017,2017/18,Fashions Direct,Ballarat,VIC,3353,Australia,Jerry Fulton,Kids,Elizabeth Gentry,2335.83
72421,1/7/2017,2017/18,Fashions Direct,Ballarat,VIC,3353,Australia,Jerry Fulton,Kids,Elizabeth Gentry,3430.66
72422,1/7/2017,2017/18,Fashions Direct,Ballarat,VIC,3353,Australia,Jerry Fulton,Home,Richard Carr,4591.11
72423,1/7/2017,2017/18,Fashions Direct,Ballarat,VIC,3353,Australia,Jerry Fulton,Home,Richard Carr,3674.59
72424,1/7/2017,2017/18,Fashions Direct,Ballarat,VIC,3353,Australia,Jerry Fulton,Juniors,Chester George,3219.74


In [216]:
# let's replace white spaces in column names with underscores
df.columns = df.columns.str.replace(' ', '_')
df.columns

Index(['Month', 'Financial_Year', 'Chain', 'Suburb', 'State', 'Postcode',
       'Country', 'Manager', 'Category', 'Buyer', 'Sales'],
      dtype='object')

In [217]:
# let's drop the financial year column because it is not useful in this analysis
df = df.drop(columns=['Financial_Year'])

In [218]:
# let's check the datatypes
df.dtypes

Month        object
Chain        object
Suburb       object
State        object
Postcode      int64
Country      object
Manager      object
Category     object
Buyer        object
Sales       float64
dtype: object

In [219]:
# let's preview the dataset again
df.head()

Unnamed: 0,Month,Chain,Suburb,State,Postcode,Country,Manager,Category,Buyer,Sales
0,1/1/2016,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Home,Richard Carr,479.02
1,1/1/2016,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,149.95
2,1/1/2016,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Mens,Bruce Curran,14.0
3,1/1/2016,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,12.0
4,1/1/2016,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Womens,Amy Buchanan,46.94


In [220]:
# let's convert the month column into timestamp
df['Month'] = pd.to_datetime(df['Month'])

In [221]:
# let's check the datatypes again
df.dtypes

Month       datetime64[ns]
Chain               object
Suburb              object
State               object
Postcode             int64
Country             object
Manager             object
Category            object
Buyer               object
Sales              float64
dtype: object

In [222]:
# let's preview the adjusted dataset
df.head()

Unnamed: 0,Month,Chain,Suburb,State,Postcode,Country,Manager,Category,Buyer,Sales
0,2016-01-01,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Home,Richard Carr,479.02
1,2016-01-01,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,149.95
2,2016-01-01,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Mens,Bruce Curran,14.0
3,2016-01-01,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Juniors,Chester George,12.0
4,2016-01-01,Fashions Direct,Chatswood,NSW,2067,Australia,Jeremy Garcia,Womens,Amy Buchanan,46.94


In [223]:
# let's select all categorical columns and store them in a variable
categorical_columns = df.select_dtypes(include=['object']).columns

In [224]:
# let's use label encoder to convert categorical columns to numeric
from sklearn.preprocessing import LabelEncoder

# let's create a label encoder object
le = LabelEncoder()

# let's loop through the categorical columns and apply label encoder
for column in categorical_columns:
    df[column] = le.fit_transform(df[column])

    # let's check the datatypes again

df.head()    

Unnamed: 0,Month,Chain,Suburb,State,Postcode,Country,Manager,Category,Buyer,Sales
0,2016-01-01,0,25,1,2067,0,6,2,8,479.02
1,2016-01-01,0,25,1,2067,0,6,5,2,149.95
2,2016-01-01,0,25,1,2067,0,6,7,1,14.0
3,2016-01-01,0,25,1,2067,0,6,5,2,12.0
4,2016-01-01,0,25,1,2067,0,6,9,0,46.94


In [225]:
# let's drop the month column
df = df.drop(columns=['Month'])

In [226]:
# let's select x and y variables
X = df.drop(columns=['Sales'])
y = df['Sales']


In [233]:
# train test split
v = 1000
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [228]:
# training the algorithm
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [229]:
# getting the intercept and coefficients
print("Intercept: ", lr.intercept_)
print("Coefficients: ", lr.coef_)

Intercept:  759.008937933726
Coefficients:  [-5.72890367e+02 -7.13759956e-01  2.50157664e+00 -1.11625850e-02
  1.49213975e-13  3.95371023e-01  3.95175947e+01  3.18758719e+01]


In [230]:
# making predictions
y_pred = lr.predict(X_test)

df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head()

Unnamed: 0,Actual,Predicted
10305,921.73,1048.461647
23724,44.91,1030.993463
19254,1132.06,428.658431
71978,2784.22,996.178568
51463,198.65,925.148351


In [231]:
# evaluating the model
from sklearn import metrics
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: ", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error:  795.7409827960855
Mean Squared Error:  1438172.6123618693
Root Mean Squared Error:  1199.2383467692605


In [234]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
r2_score = regressor.score(X_test,y_test)
print(r2_score*v,'%')

63.564756895295616 %
