# Importing Libraries

In [463]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Reading csv

In [464]:
df_train = pd.read_csv('../Class/machine-learning-june-2018/Class9/sales_data_training.csv')
df_test = pd.read_csv('../Class/machine-learning-june-2018/Class9/sales_data_test.csv')
print df_train.shape
print df_test.shape
df_train['critic_rating'] = df_train['critic_rating'].astype(str)
df_test['critic_rating'] = df_test['critic_rating'].astype(str)
df_train.head()
df_test.head()

(1000, 10)
(400, 10)


Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3.5,1,1,1,0,1,0,1,247537,59.99
1,2.5,0,0,0,1,1,0,0,73960,59.99
2,3.5,0,0,0,0,1,1,0,82671,59.99
3,4.0,1,1,0,0,1,0,0,137456,39.99
4,2.0,1,0,1,0,1,0,0,89639,59.99


# Getting Training Data

In [465]:
train_data = df_train.values
x_train = df_train.drop(columns=['total_earnings'])
x_train = np.array(x_train)
y_train = np.array(train_data)[:,-2]
print x_train.shape
print y_train.shape

(1000, 9)
(1000,)


# Getting Testing Data

In [466]:
test_data = df_test.values
x_test = df_test.drop(columns=['total_earnings'])
y_test = np.array(test_data)[:,-2]
x_test = np.array(x_test)
print x_test.shape
print y_test.shape

(400, 9)
(400,)


## Random Forest

In [467]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9835089102774014


## Linear Regression

In [468]:
lr = LinearRegression()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9827583499596478


## Cleaning The Data

In [469]:
rating_mapping = {
    '2.0': 0, 
    '2.5': 1,
    '3.0': 2,
    '3.5': 3,
    '4.0': 4,
    '4.5': 5,
    '5.0': 6
}
df_train.critic_rating = df_train.critic_rating.map(rating_mapping)
df_test.critic_rating = df_test.critic_rating.map(rating_mapping)

In [470]:
df_train.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3,1,0,1,0,1,0,0,132717,59.99
1,5,0,0,0,0,1,1,0,83407,49.99
2,2,0,0,0,0,1,1,0,62423,49.99
3,5,1,0,0,0,0,0,1,69889,39.99
4,4,1,0,1,0,1,0,1,161382,59.99


In [471]:
df_test.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3,1,1,1,0,1,0,1,247537,59.99
1,1,0,0,0,1,1,0,0,73960,59.99
2,3,0,0,0,0,1,1,0,82671,59.99
3,4,1,1,0,0,1,0,0,137456,39.99
4,0,1,0,1,0,1,0,0,89639,59.99


In [472]:
df_test.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,total_earnings,unit_price
0,3,1,1,1,0,1,0,1,247537,59.99
1,1,0,0,0,1,1,0,0,73960,59.99
2,3,0,0,0,0,1,1,0,82671,59.99
3,4,1,1,0,0,1,0,0,137456,39.99
4,0,1,0,1,0,1,0,0,89639,59.99


# Getting clean Training Data

In [473]:
train_data = df_train.values
x_train = df_train.drop(columns=['total_earnings'])
x_train = np.array(x_train)
y_train = np.array(train_data)[:,-2]
print x_train.shape
print y_train.shape

(1000, 9)
(1000,)


# Getting clean Testing Data

In [474]:
test_data = df_test.values
x_test = df_test.drop(columns=['total_earnings'])
y_test = np.array(test_data)[:,-2]
x_test = np.array(x_test)
print x_test.shape
print y_test.shape

(400, 9)
(400,)


In [475]:
np.unique(df_train['critic_rating'].unique())

array([0, 1, 2, 3, 4, 5, 6])

# Random Forest on clean data

In [476]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9832607941807786


# Linear Regression on clean data

In [477]:
r = LinearRegression()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9816832418608185


# Normalising Data

In [478]:
scaler = MinMaxScaler()

y_train = y_train.astype('float64')
y_test = y_test.astype('float64')
# scaler.fit(y_train)
y_train = scaler.fit_transform( np.reshape(y_train, (-1,1)) )
# y_train = scaler.transform( np.reshape( y_train, (-1,1) ) )
y_test = scaler.transform( np.reshape(y_test, (-1,1)) )
y_train = y_train[:,0]
y_test = y_test[:,0]
print y_train[:10]
print y_test[:10]

[0.37471396 0.19242528 0.11485185 0.14245208 0.48068243 0.13972015
 0.11338792 0.44906748 0.06127428 0.2066801 ]
[0.79917931 0.15750171 0.18970444 0.39223305 0.21546367 0.26756992
 0.24181069 0.70901832 0.43852794 0.39578936]


# Random Forest on normalised clean data

In [479]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9808461029046007


# Linear Regression on normalised clean data

In [480]:
lr = LinearRegression()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9847786996373067


In [481]:
rf = RandomForestRegressor(n_estimators=12, max_depth=14)
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9807277483942999


In [482]:
lr = LinearRegression(fit_intercept=True)
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9849912051459502


In [483]:
lr = LinearRegression()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9833592537757834


## Applying ceil on unit_price

In [484]:
x_train = df_train.drop(columns=['total_earnings'])
temp3 = np.ceil( np.reshape(np.array(df_train.unit_price), (-1,1)) )
x_train['unit_price'] = temp3
x_train.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,unit_price
0,3,1,0,1,0,1,0,0,60.0
1,5,0,0,0,0,1,1,0,50.0
2,2,0,0,0,0,1,1,0,50.0
3,5,1,0,0,0,0,0,1,40.0
4,4,1,0,1,0,1,0,1,60.0


In [485]:
x_test = df_test.drop(columns=['total_earnings'])
temp4 = np.ceil( np.reshape(np.array(df_test.unit_price), (-1,1)) )
x_test['unit_price'] = temp4
x_test.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,unit_price
0,3,1,1,1,0,1,0,1,60.0
1,1,0,0,0,1,1,0,0,60.0
2,3,0,0,0,0,1,1,0,60.0
3,4,1,1,0,0,1,0,0,40.0
4,0,1,0,1,0,1,0,0,60.0


In [486]:
lr = LinearRegression(fit_intercept=True)
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9861260764399821


In [487]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9835912221059154


## Normalising ceiled unit_price

In [488]:
scaler1 = MinMaxScaler()

temp1 = scaler.fit_transform( np.reshape(np.array(df_train.unit_price), (-1,1)) )
# y_train = scaler.transform( np.reshape( y_train, (-1,1) ) )
temp2 = scaler.transform( np.reshape(np.array(df_test.unit_price), (-1,1)) )
x_train['unit_price'] = temp1
x_test['unit_price'] = temp2
x_train.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,unit_price
0,3,1,0,1,0,1,0,0,1.0
1,5,0,0,0,0,1,1,0,0.5
2,2,0,0,0,0,1,1,0,0.5
3,5,1,0,0,0,0,0,1,0.0
4,4,1,0,1,0,1,0,1,1.0


In [489]:
x_test.head()

Unnamed: 0,critic_rating,is_action,is_exclusive_to_us,is_portable,is_role_playing,is_sequel,is_sports,suitable_for_kids,unit_price
0,3,1,1,1,0,1,0,1,1.0
1,1,0,0,0,1,1,0,0,1.0
2,3,0,0,0,0,1,1,0,1.0
3,4,1,1,0,0,1,0,0,0.0
4,0,1,0,1,0,1,0,0,1.0


In [503]:
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9841814398107959


In [521]:
lr = LinearRegression(fit_intercept=True)
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.9852974010450033


In [500]:
lr = LinearRegression()
rf.fit(x_train,y_train)
print rf.score(x_test,y_test)

0.984959290248683
