In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error,r2_score
# from sklearn.linear_model import LinearRegression
# from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.svm import SVR,LinearSVR
import xgboost

In [4]:
# pip install xgboost

In [5]:
beerRating = pd.read_csv('beer_ratings.txt',sep='\t',header=None)
beerRating.columns = ['alcohol_by_weight','rating','bitterness','nitrogen','turbidity','sugars','degree_of_fermentation','calorific_value','density','pH','colour','sulphites']

In [6]:
#To check if there is presence of null values
beerRating.isna().sum()

alcohol_by_weight         0
rating                    0
bitterness                0
nitrogen                  0
turbidity                 0
sugars                    0
degree_of_fermentation    0
calorific_value           0
density                   0
pH                        0
colour                    0
sulphites                 0
dtype: int64

In [7]:
x = beerRating.drop(['rating'],axis=1).values
y = beerRating.iloc[:,1].values

In [8]:
bestFeatures = SelectKBest(score_func=chi2, k=11)
fit = bestFeatures.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(beerRating.drop(['rating'],axis=1).columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Feature','Score']
featureScores.nlargest(11,'Score')

Unnamed: 0,Feature,Score
5,degree_of_fermentation,987.82586
10,sulphites,592.930287
6,calorific_value,262.242733
4,sugars,167.537453
9,colour,164.197956
3,turbidity,61.431326
1,bitterness,29.844772
0,alcohol_by_weight,17.883476
2,nitrogen,13.697887
8,pH,0.363856


In [9]:
X = beerRating.drop(['density','pH'],axis=1).values
Y = beerRating.iloc[:,1].values

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=.2) 

In [26]:
# Standard Scalar
standardscalar = StandardScaler()
X_train = standardscalar.fit_transform(X_train)
X_test = standardscalar.fit_transform(X_test)

In [27]:
xgb = xgboost.XGBRegressor()

In [28]:
xgb.fit(X_train,Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=12, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [29]:
xgbPrediction = xgb.predict(X_test)

In [30]:
print((mean_squared_error(Y_test,xgbPrediction))**0.5)
print(r2_score(Y_test,xgbPrediction))

0.10625079592707343
0.9998895463578135


In [38]:
abGridSearchCV = GridSearchCV(estimator=AdaBoostRegressor(),
                              param_grid={
                                  'learning_rate' : [0.01,0.05,0.1,0.3,0.7,1],
                                  'n_estimators': (10,50,100,200),
                                  'loss' : ['linear', 'square', 'exponential']},
                                  cv = 10
                              )

In [39]:
adrGridresult = abGridSearchCV.fit(X_train,Y_train)
abBestParameter = adrGridresult.best_params_

In [40]:
abr = AdaBoostRegressor(n_estimators=abBestParameter['n_estimators'],loss=abBestParameter['loss'],learning_rate=abBestParameter['learning_rate'])

In [41]:
abr.fit(X_train,Y_train)

AdaBoostRegressor(learning_rate=1, loss='square', n_estimators=100)

In [42]:
abrPrediction = abr.predict(X_test)

In [43]:
print((mean_squared_error(Y_test,abrPrediction))**0.5)
print(r2_score(Y_test,abrPrediction))

0.6158369718655365
0.9961300582273254


In [575]:
knn = KNeighborsClassifier(n_neighbors=1,p=2)
knn.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=1)

In [576]:
knnPrediction = knn.predict(x_test)

In [577]:
print(sqrt(mean_squared_error(y_test,knnPrediction)))
print(r2_score(y_test,knnPrediction))

8.911985611774965
0.11715081936565752


In [578]:
rf = RandomForestRegressor(n_estimators=200)

In [579]:
rf.fit(x_train,y_train)

RandomForestRegressor(n_estimators=200)

In [580]:
rfPrediction = rf.predict(x_test)

In [581]:
print(sqrt(mean_squared_error(y_test,rfPrediction)))
print(r2_score(y_test,rfPrediction))

6.98235936874108
0.4580715392719965


In [583]:
etrGSC = GridSearchCV(estimator=ExtraTreesRegressor(),
                      param_grid={
                      'max_depth': range(3,11),
                      'n_estimators': (10,50,100,500)},
                   cv=5, 
                   scoring='neg_mean_squared_error'
                  )

In [584]:
etrGridresult = etrGSC.fit(x_train,y_train)
etrBestParameter = etrGridresult.best_params_

In [585]:
etr = ExtraTreesRegressor(n_estimators=etrBestParameter['max_depth'],max_depth=etrBestParameter['max_depth'])

In [586]:
etr.fit(x_train,y_train)

ExtraTreesRegressor(max_depth=10, n_estimators=10)

In [587]:
etrPredict = etr.predict(x_test)

In [588]:
print(sqrt(mean_squared_error(y_test,etrPredict)))
print(r2_score(y_test,etrPredict))

7.333005240282795
0.4022748142618848


In [741]:
gsc = GridSearchCV(estimator=RandomForestRegressor(),
                  param_grid={
                      'max_depth': range(3,13),
                      'n_estimators': (10,50,100,500)},
                   cv=5, 
                   scoring='neg_mean_squared_error'
                  )

In [742]:
gridResult = gsc.fit(X_train,Y_train)

In [743]:
best_param = gridResult.best_params_

In [744]:
randomForest = RandomForestRegressor(max_depth=best_param['max_depth'],n_estimators=best_param['n_estimators'])

In [745]:
randomForest.fit(X_train,Y_train)

RandomForestRegressor(max_depth=12, n_estimators=50)

In [746]:
bestRFPrediction = randomForest.predict(X_test)

In [747]:
print(sqrt(mean_squared_error(Y_test,bestRFPrediction)))
print(r2_score(Y_test,bestRFPrediction))

1.0183967374720015
0.9899827426049183


In [44]:
svr_gsc = GridSearchCV(
        estimator=SVR(),
        param_grid={
            'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,20],
        },
        cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

In [46]:
SVRGridResult = svr_gsc.fit(X_train,Y_train)
svrBestParams = SVRGridResult.best_params_

In [47]:
svr = SVR(kernel=svrBestParams['kernel'],C=svrBestParams['C'],epsilon=svrBestParams['epsilon'])

In [48]:
svr.fit(X_train,Y_train)

SVR(C=1, epsilon=0.0001, kernel='linear')

In [49]:
svrBestfitPrediction = svr.predict(X_test)

In [50]:
print((mean_squared_error(Y_test,svrBestfitPrediction))**0.5)
print(r2_score(Y_test,svrBestfitPrediction))

0.2049257064069634
0.9995714852299807


In [621]:
linearSVR_gsc = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid={
            'C': [0.1, 1, 100, 1000],
            'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10,20],
        },
        cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)

In [622]:
linearSVRGridResult = linearSVR_gsc.fit(x_train,y_train)
linearSVRBestParams = linearSVRGridResult.best_params_

In [623]:
linearSVR = SVR(kernel='rbf',C=svrBestParams['C'],epsilon=svrBestParams['epsilon'])

In [624]:
linearSVR.fit(x_train,y_train)

SVR(C=0.1, epsilon=1)

In [625]:
linearSVRBestfitPrediction = svr.predict(x_test)

In [626]:
print(sqrt(mean_squared_error(y_test,linearSVRBestfitPrediction)))
print(r2_score(y_test,linearSVRBestfitPrediction))

7.607610139864094
0.3566696176410946
