In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Getting Started**

Title : Red Wine Quality Prediction

Wine Quality label (derived from the `quality` score) :

0 --> Low Quality Wine (quality below 7)

1 --> Good Quality Wine (quality 7 or above)

In [None]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory into a pandas DataFrame
WINE_CSV_PATH = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(WINE_CSV_PATH)
# Preview the first rows of the frame
data.head()

## **Exploratory data analysis**

In [None]:
# Shape of the DataFrame: (number of rows, number of columns)
print(data.shape)
# Data type of every column
print(data.dtypes)

In [None]:
# Print the DataFrame.info() summary of the loaded dataset
data.info()

In [None]:
# Number of rows for each distinct value of the "quality" column
data["quality"].value_counts()

In [None]:
# Mean value of every other column, grouped by the "quality" score
data.groupby("quality").mean()

In [None]:
# Descriptive statistics of the columns, as produced by DataFrame.describe()
data.describe()

In [None]:
# Count the missing entries in each column
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# Count plot: how many rows fall under each quality score
sns.catplot(data=data, x='quality', kind='count')

In [None]:
# Bar chart of 'volatile acidity' grouped by quality score, drawn on a 5x5 figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=data, x='quality', y='volatile acidity')

In [None]:
# Bar chart of 'citric acid' grouped by quality score, drawn on a 5x5 figure
plot = plt.figure(figsize=(5, 5))
sns.barplot(data=data, x='quality', y='citric acid')

**Correlation**

+1.0  -->  Perfect positive correlation

 0.0  -->  No linear correlation

-1.0  -->  Perfect negative correlation

In [None]:
# Pairwise correlation between all columns of the dataset
correlation = data.corr()

# Draw the correlation matrix as an annotated heatmap (values shown with one decimal)
plt.figure(figsize=(10, 10))
heatmap_options = dict(
    cbar=True,
    square=True,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 8},
    cmap='Blues',
)
sns.heatmap(correlation, **heatmap_options)

# **Data Transformation**

In [None]:
# Label binarization: quality scores of 7 and above become 1, everything else becomes 0.
# The vectorized comparison yields the same integer Series as mapping each value individually.
transform = data['quality'].ge(7).astype('int64')
transform.head()

# **Model Preparation**


In [None]:
# Features: every column except the raw 'quality' score
X = data.drop(columns=['quality'])
# Label: the binarized quality computed in the transformation step
y = transform

print("The shape of X is " ,X.shape)
print("The shape of Y is " ,y.shape)

In [None]:
# Number of samples in each class of the binarized label
y.value_counts()

In [None]:
# train/test split: 20% of the rows are held out for testing, stratified on the label,
# with a fixed random_state so the split is repeatable
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("The shape of X_train is", X_train.shape )
print("The shape of X_test is", X_test.shape)
print("The shape of y_train is", y_train.shape)
print("The shape of y_test is", y_test.shape)

In [None]:
# Class counts of the training labels after the stratified split
y_train.value_counts()

In [None]:
# Class counts of the test labels after the stratified split
y_test.value_counts()

#### **Because the split is stratified, `y_train` and `y_test` keep almost the same proportion of each class.**

# **Model Training**

We will train several different models; after evaluating them, we will select the best one for production.

1.   SVM Model
2.   Logistic Regression
3.   Decision Tree
4.   Random Forest Classifier
5.   KNeighborsClassifier
6.   AdaBoost Classifier
7.   Xgb Boost Classifier

## **SVM model**

In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel, fitted on the unscaled training features.
# fit() returns the estimator itself, so the fitted model can be bound in one step.
classifier_model = svm.SVC(kernel='linear').fit(X_train, y_train)
classifier_model

## **Logistic Regression Model**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression capped at 700 solver iterations, fitted on the unscaled training features.
# fit() returns the estimator itself, so the fitted model can be bound in one step.
logistic_model = LogisticRegression(max_iter=700).fit(X_train, y_train)
logistic_model

# **Feature Scaling for Decision Tree, KNN, Random Forest & boosting**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only ...
sc_X = StandardScaler().fit(X_train)
# ... then apply that same fitted scaler to both the training and the test features
sc_X_train = sc_X.transform(X_train)
sc_X_test = sc_X.transform(X_test)

# **Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree (fixed random_state) fitted on the scaled training features.
# fit() returns the estimator itself, so the fitted model can be bound in one step.
decision_tree_model = DecisionTreeClassifier(random_state=0).fit(sc_X_train, y_train)
decision_tree_model

# **Hyper Parameter Tuning For DTC**


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values for the decision tree
grid_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10],
    min_samples_split=range(2, 10, 1),
    min_samples_leaf=range(2, 10, 1),
)

# Exhaustive 5-fold cross-validated search over the grid, using all available cores
grid_search = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=grid_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(sc_X_train, y_train)

In [None]:
# Best hyper-parameter combination found by the decision-tree grid search,
# followed by its best cross-validated score

print(grid_search.best_params_)
print(grid_search.best_score_)

# **KNeighborsClassifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with 35 neighbours, trained on the scaled training features
k_model = KNeighborsClassifier(n_neighbors=35)
# fit() hands back the fitted estimator, which the evaluation cells use as kfitModel
kfitModel = k_model.fit(sc_X_train, y_train)
print(kfitModel)

In [None]:
# finding optimal values for k
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Search on the training split only, so the held-out test rows never influence
# the choice of k. Scaling is done inside a pipeline so that each CV fold is
# standardized with statistics learned from its own training part, matching
# the scaled features the KNN model is trained on.
k_values = range(1, 100)
cross_valid_scores = []
for k in k_values:
    knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
    cross_valid_scores.append(scores.mean())

# np.argmax returns a 0-based position in the score list; map it back to the
# actual k value (the list starts at k = 1) instead of printing the raw index.
best_k = k_values[int(np.argmax(cross_valid_scores))]
print("Optimal k with cross-validation: \t", best_k)

## **Random Forest model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on the scaled training features.
# A fixed random_state (same seed as the train/test split) makes the forest,
# and therefore its reported accuracy, reproducible across notebook runs.
modelRF = RandomForestClassifier(random_state=42)
modelRF.fit(sc_X_train, y_train)

# **Model Evaluation**
### **Model Evaluation Of SVM**

In [None]:
# The SVM was fitted on the unscaled features, so it is scored on the unscaled splits.

# accuracy on the training split
svm_train_predictions = classifier_model.predict(X_train)
svm_train_accuracy = accuracy_score(y_train, svm_train_predictions)
print('Accuracy of SVM model on training data : ', svm_train_accuracy)

# accuracy on the held-out test split (kept for the final model comparison table)
svm_test_predictions = classifier_model.predict(X_test)
svm_test_data_accuray = accuracy_score(y_test, svm_test_predictions)
print('Accuracy of SVM model on test data    : ', svm_test_data_accuray)

### **Model Evaluation of LGR**

In [None]:
# The logistic-regression model was fitted on the unscaled features (X_train),
# so it has to be scored on unscaled features as well. Scoring it on the
# standardized arrays would evaluate it on inputs in a different scale from
# the one it was trained on and report a misleading accuracy.

# accuracy score on training data
X_train_prediction = logistic_model.predict(X_train)
training_data_accuray = accuracy_score(y_train, X_train_prediction)
print('Accuracy of LGR model on training data  : ', training_data_accuray)

# accuracy score on testing data (kept for the final model comparison table)
X_test_prediction = logistic_model.predict(X_test)
lgr_test_data_accuray = accuracy_score(y_test, X_test_prediction)
print('Accuracy of LGR model on test data      : ', lgr_test_data_accuray)

# **Model Evaluation of DTC after hyperparameter tuning**

In [None]:
# Tuned decision tree: the best estimator found by the grid search
dtc = grid_search.best_estimator_

# Accuracy on the scaled training split
dtc_train_predictions = dtc.predict(sc_X_train)
dtc_train_acc = accuracy_score(y_train, dtc_train_predictions)

# Accuracy on the scaled test split; y_pred holds the tuned tree's test predictions
y_pred = dtc.predict(sc_X_test)
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of Decesion Tree Model  is {dtc_train_acc}")
print(f"Test Accuracy of Decesion Tree Model      is {dtc_test_acc}")

# **Visualization of the tuned decision tree**

In [None]:
from sklearn import tree

# Draw the tuned decision tree on a 15x10 figure, with filled (coloured) nodes
plt.figure(figsize=(15, 10))
tree.plot_tree(decision_tree=dtc, filled=True)

# **Model Evaluation of Random Forest**

In [None]:
# accuracy on test data
# modelRF was fitted on the standardized training features (sc_X_train), so it
# must be scored on the standardized test features, not on the raw X_test.
X_test_prediction = modelRF.predict(sc_X_test)
kr_test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy : ', kr_test_data_accuracy)

# **Model Evaluation of KNN**

In [None]:
# The KNN model was fitted on the scaled features, so both splits are scored in scaled form.

# accuracy on the training split
knn_train_predictions = kfitModel.predict(sc_X_train)
knn_train_accuracy = accuracy_score(y_train, knn_train_predictions)
print('Accuracy on training data  : ', knn_train_accuracy)

# accuracy on the held-out test split (kept for the final model comparison table)
knn_test_predictions = kfitModel.predict(sc_X_test)
kx_lgr_test_data_accuray = accuracy_score(y_test, knn_test_predictions)
print('Accuracy on test data      : ', kx_lgr_test_data_accuray)

# **Boosting**

In [None]:
# AdaBoost classifier
from sklearn.ensemble import AdaBoostClassifier

# The tuned decision tree is passed positionally: the keyword for the base
# learner was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, while the first positional argument works in both.
ada = AdaBoostClassifier(dtc)

# NOTE(review): the 'SAMME.R' option is deprecated in recent scikit-learn
# releases - confirm it is still accepted by the installed version.
parameters = {
    'n_estimators' : [50, 70, 90, 120, 180, 200],
    'learning_rate' : [0.001, 0.01, 0.1, 1, 10],
    'algorithm' : ['SAMME', 'SAMME.R']
}

# 5-fold cross-validated grid search. It is fitted on the scaled training
# features so that it sees the same inputs as the tuned tree and the final
# AdaBoost model, which are both trained on sc_X_train.
grid_search = GridSearchCV(ada, parameters, n_jobs = -1, cv = 5, verbose = 1)
grid_search.fit(sc_X_train, y_train)

In [None]:
# Best parameter combination and best cross-validated score of the most recently fitted grid search
print(grid_search.best_params_)
print(grid_search.best_score_)

In [None]:
# Final AdaBoost model built on the tuned decision tree. The tree is passed
# positionally because the keyword for the base learner was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases.
ada = AdaBoostClassifier(dtc, algorithm = 'SAMME', learning_rate = 1, n_estimators = 50)
ada.fit(sc_X_train, y_train)

# Predict with the AdaBoost model itself. Previously the test accuracy was
# computed from a stale `y_pred` that still held the decision tree's
# predictions, so the reported "Ada Boost" score was really the tree's score.
y_pred = ada.predict(sc_X_test)

ada_train_acc = accuracy_score(y_train, ada.predict(sc_X_train))
ada_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of Ada Boost Model is {ada_train_acc}")
print(f"Test Accuracy of Ada Boost Model is {ada_test_acc}")

In [None]:
# confusion matrix of the AdaBoost model on the test split
from sklearn.metrics import confusion_matrix, classification_report
# Predict explicitly with `ada` instead of relying on whatever an earlier cell
# left in `y_pred`, so the matrix always describes the boosted model.
confusion_matrix(y_test, ada.predict(sc_X_test))

In [None]:
# classification report of the AdaBoost model on the test split
# Predict explicitly with `ada` instead of relying on whatever an earlier cell
# left in `y_pred`, so the report always describes the boosted model.
print(classification_report(y_test, ada.predict(sc_X_test)))

# **Xg Boost**

In [None]:
from xgboost import XGBClassifier

# NOTE(review): `max_depth` is a tree-booster setting and is presumably ignored
# by booster='gblinear' - confirm whether a tree booster was intended.
xgb_settings = dict(booster='gblinear', learning_rate=1, max_depth=3, n_estimators=10)
xgb = XGBClassifier(**xgb_settings)
xgb.fit(sc_X_train, y_train)

# Accuracy on the scaled training split
xgb_train_predictions = xgb.predict(sc_X_train)
xgb_train_acc = accuracy_score(y_train, xgb_train_predictions)

# Accuracy on the scaled test split; y_pred now holds the XGBoost test predictions
y_pred = xgb.predict(sc_X_test)
xgb_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of XGB Model is {xgb_train_acc}")
print(f"Test Accuracy of XGB Model is {xgb_test_acc}")

# **Models Best Scores : -**

In [None]:
# Pair every model name with its test accuracy, then build the comparison table
model_score_pairs = [
    ('xg Boost', xgb_test_acc),
    ('Ada Boost Classifier', ada_test_acc),
    ('Logistic Regression', lgr_test_data_accuray),
    ('KNN', kx_lgr_test_data_accuray),
    ('SVC', svm_test_data_accuray),
    ('Decision Tree', dtc_test_acc),
    ('Random Forest', kr_test_data_accuracy),
]
models = pd.DataFrame(model_score_pairs, columns=['Model', 'Score'])
models

### ***XGBoost and the AdaBoost classifier have almost the same score, but DTC & KNN give us the best results, so we will use KNN for production. Let's visualize the scores.***

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the test score of every model in the comparison table
plt.figure(figsize=(18, 8))
sns.barplot(data=models, x='Model', y='Score')
plt.show()

# **Predictive System for KNN.**

In [None]:
# One raw (unscaled) wine sample: 11 feature values in the same order as the columns of X
input_data = (7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5)

# changing the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the data as we are predicting the label for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Label the sample with the training column names so it has the same layout
# as the frame the scaler was fitted on.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

# The KNN model was trained on standardized features, so the sample must go
# through the same fitted scaler before prediction; raw values would be
# compared against scaled training points.
input_data_scaled = sc_X.transform(input_frame)

prediction = kfitModel.predict(input_data_scaled)
print(prediction)

if prediction[0] == 1:
    print('Good Quality Wine')
else:
    print('Bad Quality Wine')

# **Predictive System for DTC.**

In [None]:
# One raw (unscaled) wine sample: 11 feature values in the same order as the columns of X
input_data = (7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5)

# changing the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the data as we are predicting the label for only one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Label the sample with the training column names so it has the same layout
# as the frame the scaler was fitted on.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

# The decision tree was trained on standardized features, so the sample must
# go through the same fitted scaler; raw values would be tested against split
# thresholds that were learned on the scaled data.
input_data_scaled = sc_X.transform(input_frame)

# Use the tuned tree (`dtc`), i.e. the model whose accuracy is reported in the
# evaluation section, rather than the untuned baseline tree.
prediction = dtc.predict(input_data_scaled)
print(prediction)

if prediction[0] == 1:
    print('Good Quality Wine')
else:
    print('Bad Quality Wine')

### **If you find this notebook useful, please upvote.** ❤️