Thank you for visiting my notebook. In this notebook you will go through various machine learning models.

The notebook covers the following steps:
1.     Exploratory data analysis
2.     Finding the feature importance
3.     Standardization
4.     Decision Tree Classifier
5.     Random Forest Classifier
6.     SVM Classifier
7.     KNN
8.     XGBoost Classifier

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Running this cell (click Run or press Shift+Enter) prints the entries found
# directly under that directory; os.listdir does not descend into sub-directories.

import os
print(os.listdir("../input"))

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# Call set_option on the package alias directly rather than through `pd.pandas`,
# which is not a documented attribute of the pandas package.
pd.set_option('display.max_columns', None)

In [None]:
df=pd.read_csv("../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
print(df.shape)

In [None]:
df.head()

# Exploratory data analysis

### checking missing values

In [None]:
# Plot the null mask as a heatmap: any missing cell would stand out against the
# uniform background. Row labels and the colour bar are switched off.
sns.heatmap(
    df.isnull(),
    yticklabels=False,
    cbar=False,
    cmap='viridis',
)

# Classifying wines as GOOD or BAD using Label Encoding

In [None]:
df['quality']=pd.cut(df['quality'],bins=(2,6.5,8),labels=['Bad','Good'])

In [None]:
from sklearn.preprocessing import LabelEncoder
category=LabelEncoder()
df['quality']=category.fit_transform(df['quality'])

In [None]:
df.head()

In [None]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [None]:
X.isnull()

In [None]:
y.isnull()

In [None]:
sns.pairplot(df)

In [None]:
df.corr()

In [None]:
corrmat=df.corr()
top_corr_features=corrmat.index
plt.figure(figsize=(7,7))
g=sns.heatmap(df[top_corr_features].corr(),annot=True,cmap='RdYlGn')

In [None]:
corrmat.index

# finding the feature importance

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
model=ExtraTreesRegressor()
model.fit(X,y)

In [None]:
X.head()

In [None]:
print(model.feature_importances_)

In [None]:
feat_importances=pd.Series(model.feature_importances_,index=X.columns)
feat_importances.plot(kind='barh')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Standardization

### Min-max scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max=MinMaxScaler()

In [None]:
X_train=min_max.fit_transform(X_train)
X_test=min_max.fit_transform(X_test)

In [None]:
y_train.head()

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree=DecisionTreeClassifier()

In [None]:
dtree.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(dtree,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
prediction1=dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction1))

### Hyperparameter tuning

In [None]:
DecisionTreeClassifier()

In [None]:
params={"splitter":["best","random"],
       "max_depth":[3,4,5,6,8,10,12,15],
       "min_samples_leaf":[1,2,3,4,5],
       "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4],
       "max_features":["auto","log2","sqrt",None],
       "max_leaf_nodes":[None,10,20,30,40,50,60,70]}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
random_search=GridSearchCV(dtree,param_grid=params,scoring="neg_mean_squared_error",n_jobs=-1,cv=10,verbose=3)

In [None]:
random_search.fit(X,y)

In [None]:
print(random_search.best_params_)

In [None]:
print(random_search.best_score_)

In [None]:
random_search.best_estimator_

In [None]:
dtree1=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.1, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
dtree1.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(dtree,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions2=dtree1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions2))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=200)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(rfc,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions3=rfc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions3))

# Hyper parameter tuning

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

In [None]:
random_grid={
    # Number of trees in random forest
    'n_estimators': [int(x) for x in np.linspace(start=100,stop=1200,num=12)],
    # Number of features to consider at every split         
    'max_features': ['auto','sqrt'],
    # Maximum number of levels in tree         
    'max_depth': [int(x) for x in np.linspace(5,30, num=6)],
    # Minimum number of samples required to split a node         
    'min_samples_split': [2,5,10,15,100],
    # Minimum number of samples required at each leaf node         
    'min_samples_leaf': [1,2,5,10]}

In [None]:
rf=RandomForestClassifier()

In [None]:
random_search=RandomizedSearchCV(rf,
                                 param_distributions=random_grid,
                                 scoring='neg_mean_squared_error',
                                 n_iter=100,
                                 cv=5,
                                 verbose=2,
                                 random_state=0,
                                 n_jobs=1)

In [None]:
random_search.fit(X_train,y_train)

In [None]:
random_search.best_params_

In [None]:
random_search.best_score_

In [None]:
random_search.best_estimator_

In [None]:
rdm_search=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=900,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(rdm_search,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
rdm_search.fit(X_train,y_train)

In [None]:
predictions4=rdm_search.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions4))

# SVM

In [None]:
from sklearn import svm
svm_model=svm.SVC() 

In [None]:
svm_model.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(svm_model,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions5=svm_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions5))

# Hyper-Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [None]:
svm=svm.SVC()

In [None]:
param_grid={'C':[0.1,1,10,100,1000,10000],
           'kernel':('linear','rbf'),
           'gamma': [1,2,3,4,5,6,7,8,9,10]}

In [None]:
grid_search=GridSearchCV(svm,
                        param_grid=param_grid,
                        verbose=False,
                        scoring='accuracy',
                        cv=15,
                        n_jobs=-1)

In [None]:
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
grid_search.best_estimator_

In [None]:
from sklearn import svm
svm1=svm.SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=8, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
svm1.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(svm1,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions6=svm1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions6))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier() 

In [None]:
knn.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(knn,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions7=knn.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions7))

# Hyper Parameter tuning

In [None]:
leaf_size=list(range(1,50))
n_neighbors=list(range(1,40))
p=[1,2]

In [None]:
params=dict(leaf_size=leaf_size,n_neighbors=n_neighbors,p=p)

In [None]:
knn_1=KNeighborsClassifier()

In [None]:
grd_search=GridSearchCV(knn_1,params,cv=10)

In [None]:
best_model=grd_search.fit(X_train,y_train)

In [None]:
best_model.best_params_

In [None]:
best_model.best_score_

In [None]:
predictions8=best_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions8))

# XGBoost Classifier

In [None]:
import xgboost
xg_model=xgboost.XGBClassifier()

In [None]:
xg_model.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(xg_model,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions9=xg_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions9))

# Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost

In [None]:
classifier=xgboost.XGBClassifier()

In [None]:
params1={"learning_rate":[0.05,0.10,0.15,0.20,0.25,0.30],
        "max_depth":[3,4,5,6,8,10,12,15],
        "min_child_weight":[1,3,5,7],
        "gamma":[0.0,0.1,0.2,0.3,0.4],
        "colsample_bytree":[0.3,0.4,0.5,0.7]}

In [None]:
rdm_search=RandomizedSearchCV(classifier,params1,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)

In [None]:
rdm_search.fit(X_train,y_train)

In [None]:
rdm_search.best_estimator_

In [None]:
rdm_search.best_params_

In [None]:
classifier1=xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0.0,
              learning_rate=0.05, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
classifier1.fit(X_train,y_train)

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(classifier1,X,y,cv=5)

In [None]:
score.mean()

### model evaluation

In [None]:
predictions10=classifier1.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions10))