In [None]:
from pandas import read_csv,Series
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

# Data reading
# NOTE(review): absolute path that looks like a mounted Google Drive folder
# (note the space before the slash in "DBS Lectures /") — confirm the mount
# point before running this anywhere else.
data1 = read_csv("/content/drive/MyDrive/DBS Lectures /Beverage.csv")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
data1.info()
# Show the distinct target labels present in the raw 'quality' column.
data1['quality'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   object 
dtypes: float64(11), object(1)
memory usage: 459.3+ KB
None


array(['Normal', 'Excellent', 'Poor'], dtype=object)

In [None]:
# Data Encoding
# Binary target: 'Excellent' -> 1, the other labels ('Normal', 'Poor') -> 0.
quality_codes = {'Excellent': 1, 'Normal': 0, 'Poor': 0}
# Guard so the cell can be re-run safely: mapping a column that already holds
# 0/1 would find no matching dict key and silently turn every label into NaN.
if not data1['quality'].isin([0, 1]).all():
    encoded_quality = data1['quality'].map(quality_codes)
    # Fail loudly on labels the mapping does not know instead of carrying NaN
    # targets into model fitting.
    if encoded_quality.isna().any():
        unknown_labels = sorted(set(data1.loc[encoded_quality.isna(), 'quality'].astype(str)))
        raise ValueError(f"Unexpected quality label(s): {unknown_labels}")
    data1['quality'] = encoded_quality.astype('int64')
# DataFrame.info() prints its own report and returns None; no print() wrapper.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB
None


In [None]:
# Separate the target column from the predictor columns
Y = data1['quality']               # Labels: the encoded 'quality' column
X = data1.drop(columns='quality')  # Features: every remaining column
# Report both shapes, one per line
print(X.shape, Y.shape, sep='\n')

(4898, 11)
(4898,)


In [None]:
# Data Normalization
# Standardise each feature column with statistics learned from X itself.
# NOTE(review): the scaler is fitted on all rows before the cross-validated
# grid searches run on X_, so held-out folds contribute to the scaling
# statistics — confirm this is acceptable.
X_ = StandardScaler().fit(X).transform(X)

In [None]:
#Random Forest Classifier using grid search
# Pipeline: SMOTE oversampling followed by a random forest. The imblearn
# Pipeline is used so the sampler step can sit in front of the classifier.
# max_features='sqrt' is the explicit spelling of what 'auto' meant for
# classifiers; 'auto' is deprecated since scikit-learn 1.1 and rejected from 1.3.
RF_Classifier1 = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
# Candidate forest sizes; the 'classification__' prefix routes the parameter
# to the pipeline step of that name.
no_trees = {'classification__n_estimators': [100, 150, 200, 250, 300]}
# 5-fold cross-validated search, ranked by precision.
grid_search1 = GridSearchCV(estimator=RF_Classifier1, param_grid=no_trees, scoring='precision', cv=5)
grid_search1.fit(X_, Y)

# Winning parameter setting and its mean cross-validated precision
best_parameters = grid_search1.best_params_
print(best_parameters)
best_result = grid_search1.best_score_
print(best_result)

{'classification__n_estimators': 100}
0.5405981710772059


In [None]:
#Random Forest Classifier using best parameter
# Read the tree count from the first grid search instead of hard-coding it, so
# this cell stays in sync if the search result changes.
best_n_trees = grid_search1.best_params_['classification__n_estimators']
# max_features='sqrt' replaces 'auto' (same behaviour for classifiers; 'auto'
# is deprecated since scikit-learn 1.1 and rejected from 1.3).
# NOTE(review): this forest is fitted on X_ without the SMOTE step used in the
# grid-search pipeline — confirm that is intended for the importance ranking.
RF_Classifier2 = RandomForestClassifier(
    n_estimators=best_n_trees,
    criterion='entropy',
    max_features='sqrt',
    random_state=1,
)
RF_Classifier2.fit(X_, Y)
# Label each importance with its column name and rank from most to least important.
important_feature = Series(RF_Classifier2.feature_importances_, index=X.columns).sort_values(ascending=False)
print(important_feature)

alcohol                 0.145159
density                 0.122280
volatile acidity        0.090512
residual sugar          0.088370
chlorides               0.087347
pH                      0.084545
free sulfur dioxide     0.084179
total sulfur dioxide    0.084078
citric acid             0.075975
sulphates               0.072697
fixed acidity           0.064858
dtype: float64


In [None]:
# best feature
# Keep only the three columns ranked highest in the importance table.
X2 = data1[['alcohol', 'density', 'volatile acidity']]
# Standardise the reduced matrix itself. Previously X_ (all 11 scaled columns)
# was passed here, so X2_ never reflected the feature selection.
X2_ = StandardScaler().fit_transform(X2)

In [None]:
#Random Forest Classifier using grid search with the important features
# Same SMOTE + random-forest pipeline as the full-feature search.
# max_features='sqrt' replaces 'auto' (same behaviour for classifiers; 'auto'
# is deprecated since scikit-learn 1.1 and rejected from 1.3).
RF_Classifier3 = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
# Smaller forests are tried here than in the full-feature search.
no_trees = {'classification__n_estimators': [10, 20, 30, 40, 50]}
grid_search2 = GridSearchCV(estimator=RF_Classifier3, param_grid=no_trees, scoring='precision', cv=5)
# Train on X2_, the matrix prepared for the selected features. Fitting on X_
# (as before) just repeated the full-feature search with a smaller tree grid.
grid_search2.fit(X2_, Y)

# Winning parameter setting and its mean cross-validated precision
best_parameters = grid_search2.best_params_
print(best_parameters)
best_result = grid_search2.best_score_
print(best_result)

{'classification__n_estimators': 50}
0.5309290922827856


In [None]:
# principal components Analysis (PCA)
# Learn a 6-component projection from the standardised features, then apply it
# to the same matrix to obtain the reduced representation.
pca = PCA(n_components=6).fit(X_)
X_pca = pca.transform(X_)
# Per-component share of variance, followed by the cumulative share retained.
print(
    "Variance explained by each of the n_components: ",
    pca.explained_variance_ratio_,
)
print(
    "Total variance explained by the n_components: ",
    sum(pca.explained_variance_ratio_),
)

Variance explained by each of the n_components:  [0.29293217 0.14320363 0.11106103 0.09259294 0.08848496 0.08534014]
Total variance explained by the n_components:  0.813614873394286


In [None]:
#Random Forest Classifier using principal components
# Same SMOTE + random-forest pipeline, evaluated on the PCA-projected features.
# max_features='sqrt' replaces 'auto' (same behaviour for classifiers; 'auto'
# is deprecated since scikit-learn 1.1 and rejected from 1.3).
RF_Classifier4 = Pipeline([
    ('balancing', SMOTE(random_state=101)),
    ('classification', RandomForestClassifier(criterion='entropy', max_features='sqrt', random_state=1)),
])
no_trees = {'classification__n_estimators': [100, 150, 200, 250, 300]}
# NOTE(review): this search is ranked by recall while the two earlier searches
# use precision, so the best scores are not directly comparable — confirm the
# metric choice is deliberate.
grid_search3 = GridSearchCV(estimator=RF_Classifier4, param_grid=no_trees, scoring='recall', cv=5)
grid_search3.fit(X_pca, Y)

# Winning parameter setting and its mean cross-validated recall
best_parameters = grid_search3.best_params_
print(best_parameters)
best_result = grid_search3.best_score_
print(best_result)

{'classification__n_estimators': 250}
0.5245283018867924
