In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


#Load the data
df = pd.read_csv(r"C:\Users\SAkela\lidl\candy-data.txt");

X = df.drop(["winpercent","competitorname", "sugarpercent", "pricepercent"],1)   #Feature Matrix
y = df["winpercent"]          #Target Variable
#print(df.head())

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
#print(len(X_train),len(y_train),len(X_test),len(y_test))

In [67]:
#Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
y_predictions = lin_reg.predict(X_train)
lin_mse = mean_squared_error(y_train,y_predictions)
lin_rmse = np.sqrt(lin_mse)
print("lin_rmse:", lin_rmse)
lin_scores = cross_val_score(lin_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
print("lin_rmse_scores:", lin_rmse_scores)
print("mean:", lin_rmse_scores.mean())
print("std:", lin_rmse_scores.std())
lin_r2sq = r2_score(y_train, y_predictions)
print("lin_r2sq:", lin_r2sq)

lin_rmse: 9.870064561805632
lin_rmse_scores: [15.0913503  14.0213426  11.42087153 10.86363055 12.14593423 14.53328966
  7.89933016 10.57402387  9.77534174 17.36189403]
mean: 12.368700867609387
std: 2.699932608965717
lin_r2sq: 0.5346699471984231


In [71]:
#Decision Tree
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train,y_train)
y_prediction = tree_reg.predict(X_train)
tree_mse = mean_squared_error(y_train,y_prediction)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)
tree_scores = cross_val_score(tree_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
tree_rmsescores = np.sqrt(-tree_scores)
print(tree_rmsescores)
print(tree_rmsescores.mean())
print(tree_rmsescores.std())
tree_r2sq = r2_score(y_train, y_prediction)
print("tree_r2sq:", tree_r2sq)

7.537707820926796
[12.60418858 10.02473645 15.86163639  7.58947722 12.74342193  8.83862852
  7.1374176  10.01178642 13.16681317 15.16391935]
11.31420256356239
2.8886841407551516
tree_r2sq: 0.728606330429213


In [70]:
#Random Forest
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
y_predict = forest_reg.predict(X_train)
forest_mse = mean_squared_error(y_train,y_predict)
forest_rmse = np.sqrt(forest_mse)
print(forest_rmse)
forest_scores = cross_val_score(forest_reg, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
forest_rmsescores = np.sqrt(-forest_scores)
print(forest_rmsescores)
print(forest_rmsescores.mean())
print(forest_rmsescores.std())
forest_r2sq = r2_score(y_train, y_predict)
print("forest_r2sq:", forest_r2sq)

7.83748114352828
[13.10106362  8.35267444 12.03611663  5.84164919  5.69873592 11.70394378
  7.2346102  11.10555383 11.92545166 15.56555405]
10.25653533026402
3.1315572623656136
forest_r2sq: 0.7065905260080279




In [62]:
#using GridSearch
param_grid = [
    {'n_estimators' : [3, 10, 20], 'max_features': [3, 6, 9]},
    {'bootstrap' : [False], 'n_estimators' : [3, 10], 'max_features' : [3, 4, 5]},
]
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_mean_squared_error")
grid_search.fit(X_train,y_train)
print(grid_search.best_params_)


feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)

{'max_features': 9, 'n_estimators': 10}
[0.54806045 0.03384448 0.03485151 0.13992123 0.03390382 0.0479917
 0.01952354 0.06135801 0.08054526]




In [73]:
#Test data
final_model = grid_search.best_estimator_
y_test_predict = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_test_predict)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

11.773957913401292
