In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Step 1 - Understanding the dataset
# Load the wine dataset via sklearn's load_wine (imported in the first cell)
# and report the shape of its feature matrix (wine.data).
wine = load_wine()
print("The dimensions of the dataset are : ",wine.data.shape)

In [None]:
# Step 2 - Converting the data into a dataframe
# Feature frame: the raw feature matrix, with one labelled column per entry
# of wine.feature_names.
datadf = pd.DataFrame(wine.data, columns=wine.feature_names)
# Target frame: the class labels, kept in their own single-column frame.
target_df = pd.DataFrame(data=wine.target)

In [None]:
# Preview the first rows of the target frame built from wine.target.
target_df.head()

In [None]:
# Step 3 - Displaying dataframe head
# Preview the first rows of the feature frame (columns come from wine.feature_names).

datadf.head()

In [None]:
# Step 4 - Creating Distribution plot of variables

print("-----------------DISTRIBUTION PLOT-----------------")

# (column, label shown in the panel title) for the first nine features, in
# grid order. Only the alcohol panel capitalises its label.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; when the
# environment is upgraded, consider sns.histplot(..., kde=True) instead.
grid_one_panels = [
    ("alcohol", "Alcohol"),
    ("malic_acid", "malic_acid"),
    ("ash", "ash"),
    ("alcalinity_of_ash", "alcalinity_of_ash"),
    ("magnesium", "magnesium"),
    ("total_phenols", "total_phenols"),
    ("flavanoids", "flavanoids"),
    ("nonflavanoid_phenols", "nonflavanoid_phenols"),
    ("proanthocyanins", "proanthocyanins"),
]

plt.figure(1, figsize=(15,15))
# Fill a 3x3 grid, one 5-bin distribution plot per feature.
for panel_position, (panel_column, panel_label) in enumerate(grid_one_panels, start=1):
    plt.subplot(3, 3, panel_position)
    panel_axes = sns.distplot(datadf[panel_column], bins = 5)
    panel_axes.set_title(f"Distribution of {panel_label}")

plt.suptitle("Distribution Plot of Variables")
plt.show()

In [None]:
# Distribution plots for the remaining four features, laid out on a 2x3 grid
# (the last two grid slots are left unused).
grid_two_columns = [
    "color_intensity",
    "hue",
    "od280/od315_of_diluted_wines",
    "proline",
]

plt.figure(1, figsize=(15,15))
for slot_position, slot_column in enumerate(grid_two_columns, start=1):
    plt.subplot(2, 3, slot_position)
    slot_axes = sns.distplot(datadf[slot_column], bins = 5)
    slot_axes.set_title(f"Distribution of {slot_column}")
plt.suptitle("Distribution Plot of Variables")
plt.show()

In [None]:
# Step 5 - Creating Heatmap of Correlations

print("-------------------------HEAT MAP-------------------------")
# Pairwise correlations between the feature columns, drawn as an annotated grid.
feature_correlations = datadf.corr()
sns.heatmap(feature_correlations, annot = True, cmap='coolwarm')
plt.title("Heatmap of Correlation Between Variables", fontsize=16)
# Resize the figure the heatmap was drawn on so the cell annotations have room.
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

In [None]:
# Step 6 - Regression Pair Plot 
print("----------------------------REGRESSION PAIR PLOT----------------------------")
# Features shown in the pair plot.
# NOTE(review): 'alcalinity_of_ash' is plotted in Step 4 but is not in this
# list - confirm whether the omission is intentional.
pairplot_columns = [
    'alcohol', 'malic_acid', 'ash', 'magnesium', 'total_phenols', 'flavanoids',
    'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue',
    'od280/od315_of_diluted_wines', 'proline',
]
# Histograms on the diagonal, regression fits on the off-diagonal panels.
sns.pairplot(datadf[pairplot_columns], diag_kind="hist", kind='reg')
plt.suptitle("Regression Pair Plot of Observations")
plt.show()

In [None]:
# Step 7 - Checking Missing Values

# Count the null entries in each feature column, then report the counts.
missing_per_column = datadf.isnull().sum()
print("Missing Values :\n", missing_per_column)

In [None]:
# Step 8 - Assumptions Check

# Normality Assumptions: skewness and kurtosis of the feature data are
# computed with scipy.stats (default arguments) as a rough normality check.

from scipy import stats

feature_skewness = stats.skew(datadf)
feature_kurtosis = stats.kurtosis(datadf)
print("Skewness of Data \n", feature_skewness)
print("Kurtosis of Data \n", feature_kurtosis)

In [None]:
# Step 9 - Determining Outliers by calculating Z-Score
import sys

# Raise numpy's print threshold to sys.maxsize so the z-score table below is
# printed rather than summarised. This is a kernel-wide setting and stays in
# effect for every later cell.
np.set_printoptions(threshold=sys.maxsize)

# Magnitude of the z-score; the sign is dropped so a single cut-off can be
# applied in the next step.
signed_zscore = stats.zscore(datadf)
zscore = np.absolute(signed_zscore)
print("The Z-Score of the data is \n", zscore)

In [None]:
# Step 10 - Determining indexes where Z-score is greater than 3

# np.where returns a tuple of index arrays; only its first element is reported.
# NOTE(review): presumably these are row positions (zscore comes from the 2-D
# feature frame), so a row with several extreme features would appear more
# than once - confirm that is the intended count.
exceeds_cutoff = zscore > 3
outlierlist = np.where(exceeds_cutoff)
first_axis_hits = outlierlist[0]
print("The indexes of the outliers are \n", first_axis_hits)
print("The number of outliers is \n", len(first_axis_hits))

In [None]:
# Step 11 - Feature Extraction
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
datadf = sc.fit_transform(datadf)

In [None]:
# Setting Seed
np.random.seed(1000)

In [None]:
# Step 12 - Creating training, test dataset for dependent and independent variables
X_train, X_test, y_train, y_test = train_test_split(datadf, target_df, random_state = 10)

In [None]:
# Report the shape of the training and test feature sets, in that order.
for split_name, split_features in (("training", X_train), ("test", X_test)):
    print(f"Dimension of {split_name} dataset: ", split_features.shape)

In [None]:
# Step 13 - Creating the Logistic Regression Model
# Instantiated with default hyper-parameters; it is fitted in the next step.
wine_logreg_model = LogisticRegression()

In [None]:
# Step 14 - Fitting the model
# y_train is a single-column frame; flatten its values to a 1-D label array
# before handing it to fit().
train_labels = y_train.values.ravel()
wine_logreg_model.fit(X_train, train_labels)

In [None]:
# Step 15 - Predicting with the trained model
# Only the predictions are produced here; `pred` is consumed by the
# confusion-matrix cell further down.
pred = wine_logreg_model.predict(X_test)

In [None]:
# Step 16 - Determining the score of training and test dataset
# score() is evaluated on each split; comparing the two values gives a quick
# over-fitting check.
training_score = wine_logreg_model.score(X_train, y_train)
print("Training set score :", training_score)
test_score = wine_logreg_model.score(X_test, y_test)
print("Test set score :", test_score)

In [None]:
# Step 17 - Summarising the test-set predictions with a confusion matrix
# The true test labels are passed first and the model's predictions second.
results = confusion_matrix(y_true=y_test, y_pred=pred)
print("Result of the Confusion Matrix is: \n", results)