### This is a classification problem
### We will predict the quality of a wine from its various independent features

### Imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xbg
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# Read the wine dataset from the Kaggle input folder and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../input/beginner-datasets/beginner_datasets/wine.csv")
df.head(5)

## EDA

In [None]:
# Print column dtypes and non-null counts to look for missing values.
df.info()

# Looks like we don't have any null values.
# (Sometimes a sentinel value such as -999 or -1 is used instead of null, so we'll use df.describe() to check for such values.)

In [None]:
# Summary statistics per numeric column; used here to spot sentinel values and extreme maxima.
df.describe()

# Looks like we're good, but we notice there are some values well above the 75th percentile.
# These might be outliers, so we'll check them.

In [None]:
# lets do a box plot to check for outliers

In [None]:
# Box plots of the first five columns on one shared axis, as a quick outlier check.
first_five_cols = df.iloc[:, :5]
plt.figure(figsize=(15, 5))
first_five_cols.boxplot();

In [None]:
# One box plot per column (every column except the last) to see the outliers.
for col_name in df.columns[:-1]:
    sns.boxplot(x=df[col_name])
    plt.show()

# Some of the first columns have a lot of outliers...
# For a classification problem we might not have to worry about them, because a random forest
# is largely unaffected by outliers.

# But of course, if we try logistic regression we might need to do something about them...

In [None]:
# Let's check the distribution of every column except the last one.
# sns.distplot is deprecated, so use histplot with a KDE overlay instead;
# stat="density" keeps the histogram on the same normalised scale distplot used.

for col in df.iloc[:,:-1].columns:
    sns.histplot(df[col], kde=True, stat="density")
    plt.show()

In [None]:
# These distributions are actually quite good.
# We can apply a log transform to even them out a little bit.

# Let's try it.
# np.log1p(x) computes log(1 + x): a value of 0 maps to 0 instead of hitting log(0), which is -inf.
# sns.distplot is deprecated, so histplot with a KDE overlay is used (stat="density" keeps the same scale).

for col in df.iloc[:,:-1].columns:
    sns.histplot(np.log1p(df[col]), kde=True, stat="density")
    plt.show()

In [None]:
# there are other transformations but, i currently need more knowledge about where all each performs best...
# log and boxcox have a good name...

### Feature engineering

In [None]:
# Target is the 'quality' column; the features are an independent copy of everything else.
y = df['quality']
X_log = df.drop(columns=['quality']).copy()

In [None]:
# Log-transform every feature except the last column ('type', which is label-encoded in the next cell).
# The values are read from the untouched `df`, so re-running this cell does not apply log1p twice.
for col in X_log.columns[:-1]:
    X_log[col] = np.log1p(df[col])

In [None]:
# Now let's label-encode the last column, 'type', into integer codes.

encoder = LabelEncoder()
encoder.fit(X_log['type'])
X_log['type'] = encoder.transform(X_log['type'])

In [None]:
# lets standardize it as well...

In [None]:
# Standardise every feature column and wrap the result back into a DataFrame with the same column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_log)
X_scaled = pd.DataFrame(scaled_values, columns=X_log.columns)

### Feature selection

In [None]:
# Put the scaled features and the target side by side for the correlation analysis.
# NOTE(review): concat(axis=1) aligns on the index; X_scaled was built with a fresh default index,
# so this assumes `y` (taken from df) also has a default 0..n-1 index — confirm.
data = pd.concat([X_scaled,y],axis=1)

In [None]:
# Annotated heat map of the pairwise correlations between the scaled features and the target.
plt.figure(figsize=(10, 10))
cor = data.corr()
sns.heatmap(data=cor, annot=True, cmap='Greens');

### There aren't many strong correlations with quality...

### But "type" is strongly correlated with several of the independent features (thinking of predicting it instead, maybe at the end)

In [None]:
### Lets use a sklearn library

In [None]:
from sklearn.feature_selection import mutual_info_classif

# The estimator adds small random noise to continuous features, so the scores vary between runs
# unless the seed is fixed. The top-7 features are selected from these scores later, so pin it.
mutual_info_vals = mutual_info_classif(X_scaled, y, random_state=42)
# Index the scores by feature name so they can be ranked and plotted.
mutual_val_df = pd.DataFrame({"vals":mutual_info_vals},index=X_scaled.columns)
plt.figure(figsize=(10,5))
mutual_val_df.vals.sort_values(ascending=False).plot(kind='bar');


### This shows us the mutual information (information gain) between each feature and the target

## Let's try a score that measures linear relationships (the ANOVA F-test)

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F-score of every feature against the target; the p-values are not used.
f_vals, _ = f_classif(X_scaled, y)
# Index the scores by feature name, then plot them from highest to lowest.
f_vals_df = pd.DataFrame({"vals": f_vals}, index=X_scaled.columns)
plt.figure(figsize=(10, 5))
f_ranked = f_vals_df["vals"].sort_values(ascending=False)
f_ranked.plot(kind='bar');

In [None]:
# Keep the 7 highest-scoring features by mutual information and the 5 highest by F-score.
mut_ranked = mutual_val_df["vals"].sort_values(ascending=False)
lin_ranked = f_vals_df["vals"].sort_values(ascending=False)
mut_feat = mut_ranked.iloc[:7].index
lin_feat = lin_ranked.iloc[:5].index

### model building

In [None]:
# Candidate classifiers compared throughout the notebook.
# The tree-based estimators are stochastic, so their seed is fixed (same value as the
# train/test split) to make Restart & Run All reproduce the same scores.
models = [
    LogisticRegression(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    xbg.XGBRFClassifier(random_state=42),
    SVC(),
]


In [None]:
# 5-fold cross-validation of every model on the selected features, predicting the raw quality score.
# Each entry pairs the estimator with its array of per-fold scores.
result_cv = [
    (model, cross_val_score(model, X_scaled[mut_feat], y, cv=5))
    for model in models
]

In [None]:
# Show each (model, per-fold score array) pair collected by the cross-validation loop.
print(result_cv)

### I've seen in a Towards Data Science post (by Terence Shin) that you can divide the wine quality into 2 categories (that is, <7 is bad, >7 is good)
### We'll try that
### He also made subsets of the data based on good and bad quality and looked at how they vary (use `.describe()` on the subsets to see the means, maxima and minima of the independent features... that was good)

In [None]:
# Preview only the first rows of the selected features instead of rendering the whole frame.
X_scaled[mut_feat].head()

In [None]:
# Binary target: 1 ("good") when quality is strictly above 7, otherwise 0 ("bad").
# NOTE(review): wines with quality exactly 7 land in the "bad" class here; the referenced post
# may use >= 7 as the cut-off — confirm which threshold is intended.
is_good = y > 7
y_easy = is_good.astype(int).to_numpy()

In [None]:
# 5-fold cross-validation of every model on the binary good/bad target.
# The list that is appended to must be the one that is initialised here; previously
# `result_cv` was reset instead, so `result_cv_easy` was undefined on a fresh kernel.
result_cv_easy = []
for model in models:
    result_cv_easy.append(cross_val_score(model,X_scaled[mut_feat],y_easy,cv=5))

In [None]:
# Let's just compute the average score of each model across the folds.

for fold_scores in result_cv_easy:
    mean_score = np.mean(fold_scores)
    print(mean_score)

In [None]:
# Logistic regression and SVC have performed the best...

### Remember we talked about using the dataset to predict the wine type?
### Let's go... we'll use the X_log version

In [None]:
# For the wine-type task the encoded 'type' column becomes the target,
# and 'quality' joins the remaining log-transformed columns as a feature.
y_type = X_log['type']
features_without_type = X_log.drop(columns=['type'])
X_type = pd.concat([features_without_type, y], axis=1)

In [None]:
# 5-fold cross-validation of every model on the wine-type task.
# Use the data prepared for this task (X_type -> y_type); the previous call reused
# X_scaled / y_easy and therefore just repeated the good/bad experiment.
result_cv_type = []
for model in models:
    result_cv_type.append(cross_val_score(model,X_type,y_type,cv=5))

In [None]:
# Display the per-fold cross-validation scores collected in result_cv_type (one array per model).
result_cv_type

In [None]:
# Average score of each model across the folds.
for fold_scores in result_cv_type:
    mean_score = np.mean(fold_scores)
    print(mean_score)

### Next we will use some metrics to check the performance, and then do hyperparameter tuning...

In [None]:
# I'm only going to do this for the good/bad wine prediction,
# and we'll go with logistic regression.
# We need y_test and the predictions for it, so hold out 30% of the rows as a test set.

X_selected = X_scaled[mut_feat]
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y_easy,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a default logistic regression on the training split and report its accuracy on the test split.
log_clf = LogisticRegression().fit(X_train, y_train)
log_clf.score(X_test, y_test)

In [None]:
# Class predictions of the fitted logistic regression for the held-out test features.
y_pred = log_clf.predict(X_test)

In [None]:
# Rows are the true classes and columns the predicted classes (0 = bad, 1 = good).
confusion_matrix(y_test,y_pred)

# The top-right cell holds the false positives: bad wines that were predicted as good.
# The bottom-left cell holds the false negatives: good wines that were predicted as bad.

### yeah... it classified everything as good and we had less bad wine in test set (50) so our accuracy was high...

### lets check other's

In [None]:
# Fit every candidate model on the training split and show its own confusion matrix.
for model in models:
    model.fit(X_train,y_train)
    # Predict with the model that was just fitted. Previously the matrix was always built from
    # the logistic-regression `y_pred`, so every model printed the same numbers.
    # A separate name keeps `y_pred` intact for the cells that still use it.
    model_pred = model.predict(X_test)
    print(type(model).__name__)
    print(confusion_matrix(y_test,model_pred),end='\n\n')

# NOTE(review): the earlier conclusion "all are bad models" was drawn from matrices that all
# reused the logistic-regression predictions; re-check it against the per-model output.

In [None]:
# Per-class precision, recall and F1 for y_pred, i.e. the logistic-regression predictions from log_clf.
print(classification_report(y_test,y_pred))

## We can use sampling techniques on this to make it predict both good and bad wines but this is my first notebook so like...           "Thank you"