In [None]:
#Custom Functions
'''
Null_value_imputations
1. show_percentage_values_missing(dataFrame:pd.DataFrame) Show all percentage values missing
2. missing_val_regression(df , feature)                   Impute the missing values of one continuous column via linear regression
3. replace_all_empty_in_df(df :pd.DataFrame)              Replace all empty data in dataframe

Outlier handling
1. boxplot_plot(data:pd.DataFrame, bins:int, vert:bool, print_stats:bool)   Bin number allows us to automatically segment the boxplots
2. outlier_removal_by_std(data:pd.DataFrame,std_multi:int=3, inplace:bool=False)            Don't use if more than 20% dataloss
3. outlier_using_range(data:pd.DataFrame, inplace:bool=False)               If too much data loss, use this function instead

Density and correlation
1. get_density_plots(data:pd.DataFrame, suptitle:str)
2. correlation_fig(df:pd.DataFrame)

Kmeans
1. norm(a:list)
2. elbow_method(data:pd.DataFrame, max_range:int, title:str)
'''
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stat
import seaborn as sns
from cycler import cycler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
def show_percentage_values_missing(dataFrame:pd.DataFrame):
    """Summarise the missing values of every column that has at least one.

    Builds a three-column table (absolute count of nulls, percentage of rows
    that are null, column dtype) and returns only the rows whose percentage
    is greater than zero. When the frame holds no nulls at all, a banner is
    printed and the returned table is empty.
    """
    # Null count per column, largest first.
    null_counts = dataFrame.isnull().sum().sort_values(ascending=False)
    # Same counts expressed as a percentage of the number of rows.
    null_percent = null_counts.apply(lambda count: (count / dataFrame.shape[0]) * 100)

    summary = pd.concat(
        [null_counts, null_percent, dataFrame.dtypes],
        axis=1,  # one summary column per input series
        keys=['Values missing', 'Percent of missing', 'Data type'],
    )

    if null_counts.sum() == 0:
        print('\n==================================================\nThere are no null values present in this dataframe\n==================================================')

    has_missing = summary['Percent of missing'] > 0
    return summary[has_missing]

def missing_val_regression(df,feature):
    """Impute the missing values of one column with a linear regression.

    The predictors are every column of ``df`` that contains no nulls and is
    not of ``object`` dtype (``feature`` itself is therefore never a
    predictor). The model is fitted on the rows of ``df`` that have no null
    in any column and then used to fill only the rows where ``feature`` is
    null.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    feature : column label
        Column whose missing values should be predicted.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the nulls of ``feature`` replaced by predictions.

    Raises
    ------
    ValueError
        If no complete, non-object column is available to act as predictor.
    """
    df_copy = df.copy()
    missing_mask = df_copy[feature].isnull()
    if not missing_mask.any():
        # Nothing to impute; skip fitting altogether.
        return df_copy

    #Define Null Colls and Categorical/Obj Colls
    null_columns = df.columns[df.isna().any()]
    obj_columns = df.select_dtypes(include='object').columns
    excluded = set(null_columns) | set(obj_columns)
    # Keep the frame's own column order so the design matrix is deterministic.
    parameters = [column for column in df.columns if column not in excluded]
    if not parameters:
        raise ValueError(
            "No complete, non-object column is available to predict %r" % (feature,)
        )

    df_true = df_copy.dropna() #DF with non_nans

    model = LinearRegression()
    model.fit(X = df_true[parameters],y = df_true[feature])

    # Predict only for the rows that actually need a value.
    df_copy.loc[missing_mask, feature] = model.predict(df_copy.loc[missing_mask, parameters])
    return df_copy

def replace_all_empty_in_df(df :pd.DataFrame):
    """Impute every column that contains nulls, one column at a time.

    Columns with missing values are processed in reverse column order. Each
    one is filled by ``missing_val_regression``; because that helper returns
    a new frame, a column imputed earlier in the loop no longer counts as a
    null column for the later iterations. After each step the remaining
    missing-value summary is printed.

    Returns the fully processed frame; the frame passed in is left untouched.
    """
    for missing_feature in reversed(df.columns[df.isna().any()]):
        df = missing_val_regression(df, missing_feature)
        # Report on the frame being imputed (previously referenced an undefined name).
        print(show_percentage_values_missing(df))
    return df

#Outlier
def boxplot_plot(data:pd.DataFrame, bins:int, vert:bool, print_stats:bool=False):
    """Draw box plots, grouping columns of similar scale into the same figure.

    For every column the standard deviation of its ``describe()`` statistics
    is computed; those values are cut into ``bins`` equal-width buckets and
    one box-plot figure is drawn per non-empty bucket.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are plotted.
    bins : int
        Number of equal-width buckets used to group the columns.
    vert : bool
        Passed to ``DataFrame.plot.box``; vertical boxes when true.
    print_stats : bool, default False
        When true, print the per-column standard deviations and the
        column-to-bucket table before plotting.
    """
    some_df = pd.DataFrame()
    some_df['range'] = pd.cut(data.describe().std(),bins = bins, labels=list(range(0,bins)))
    if print_stats:
        # Printed only after the bucket table exists; it used to be read before assignment.
        print(data.std(numeric_only=True).to_list())
        print(some_df)
    # Skip NaN buckets and walk the buckets in ascending order for a stable figure order.
    for i in sorted(some_df['range'].dropna().unique()):
        desired_column = some_df[some_df['range']==i].index
        df_new = pd.DataFrame(data[desired_column])
        df_new.plot.box(grid=True, vert=vert)
        
def outlier_removal_by_std(data:pd.DataFrame,std_multi:float=3, inplace:bool=False):
    """Drop rows that lie more than ``std_multi`` standard deviations from the mean.

    Every non-object column is processed in turn. The mean and standard
    deviation are recomputed on the rows that are still present, so rows
    removed for an earlier column influence the limits of later columns.
    A short before/after report is printed.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to clean.
    std_multi : float, default 3
        Width of the accepted band, in standard deviations.
    inplace : bool, default False
        When true, rows are dropped from ``data`` itself and ``None`` is
        returned; otherwise ``data`` is left untouched and the cleaned copy
        is returned.

    Returns
    -------
    pd.DataFrame or None
        The cleaned copy when ``inplace`` is false, otherwise ``None``.
    """
    df_copy = data if inplace else data.copy()
    data_shape_before = data.shape  # captured before any row is dropped
    print('Before outlier removal   : ', data_shape_before)
    for name in df_copy.select_dtypes(exclude="object").columns:
        center = df_copy[name].mean()
        spread = std_multi*df_copy[name].std()
        upper_limit = center + spread
        lower_limit = center - spread
        is_outlier = (lower_limit>df_copy[name]) | (df_copy[name]>upper_limit)
        df_copy.drop(df_copy[is_outlier].index, inplace = True)
    print('After removal of outliers: ' ,df_copy.shape)
    rows_before = data_shape_before[0]
    # Guard the percentage against an empty input frame.
    removed_percent = ((rows_before-df_copy.shape[0])/rows_before)*100 if rows_before else 0.0
    print('Total percentage of data removed due to outliers', removed_percent)
    print('With upperlimit = (mean+',std_multi,'x std)')
    # Without this the cleaned copy was lost whenever inplace was false.
    return None if inplace else df_copy

def outlier_using_range(data:pd.DataFrame, inplace:bool=False, range_threshold:float=12, bins:int=8):
    """Replace wide-ranged numeric columns by equal-width bin labels.

    Every numeric (non-boolean) column whose ``max - min`` exceeds
    ``range_threshold`` is cut into ``bins`` equal-width intervals labelled
    ``1..bins``. The binned values are stored in a new ``<column>_RANGE``
    column and the original column is dropped. Other columns are left as is.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to transform.
    inplace : bool, default False
        When true, ``data`` itself is modified and ``None`` is returned;
        otherwise ``data`` is left untouched and a transformed copy is returned.
    range_threshold : float, default 12
        Columns with a value range above this are binned.
    bins : int, default 8
        Number of equal-width bins.

    Returns
    -------
    pd.DataFrame or None
        The transformed copy when ``inplace`` is false, otherwise ``None``.
    """
    # copy() must be called; the bare attribute is a bound method, not a frame.
    df2_copy = data if inplace else data.copy()
    bin_labels = list(range(1, bins + 1))
    # Snapshot the column labels because columns are added and dropped in the loop.
    for col in list(df2_copy.columns):
        column = df2_copy[col]
        # max - min is only meaningful for real numeric data.
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue
        if (column.max() - column.min()) > range_threshold:
            df2_copy[f'{col}_RANGE'] = pd.cut(column, bins = bins, labels=bin_labels)
            df2_copy.drop(columns=col, inplace=True)

    return None if inplace else df2_copy


def get_density_plots(data:pd.DataFrame, suptitle:str):
    """Plot a histogram plus fitted normal curve for every non-object column.

    The subplots are laid out on a grid that is five columns wide. Each one
    shows a 20-bin, density-normalised histogram of the column together with
    the normal pdf that has the column's mean and standard deviation. Grid
    cells that are not needed are removed before the figure is shown.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose non-object columns are plotted.
    suptitle : str
        Title placed above the whole figure.
    """
    numeric_columns = data.select_dtypes(exclude = "object").columns
    df_len = len(numeric_columns)
    if df_len == 0:
        # A grid with zero rows cannot be created; there is nothing to draw.
        return
    rows = (df_len + 4)//5  # ceiling division: five plots per row

    #Axes
    fig, axes = plt.subplots(rows,5, figsize=(20,20))
    axes = axes.flatten()
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    fig.suptitle(suptitle, fontsize=20)

    for i,name in enumerate(numeric_columns):
        #Math of density function
        mean = data[name].mean()
        std = data[name].std()
        x_axis = sorted(data[name])
        y_axis = stat.norm.pdf(x_axis, mean, std)

        # Wrap around the colour cycle; the curve uses the colour two steps ahead of the bars.
        line_color = colors[(i+2) % len(colors)]
        bar_color = colors[i % len(colors)]
        #Plot histogram and lineplot in similar ax
        axes[i].plot(x_axis,y_axis,color=line_color, linewidth=3)
        axes[i].hist(data[name], bins=20, rwidth=0.6, density=True,color=bar_color)
        axes[i].legend(['density','frequency'], 
                       bbox_to_anchor=(0., 1.02, 1., .102), loc='lower left', ncol=2, mode="expand", borderaxespad=0.)
        #Set x and y labels
        axes[i].set_xlabel(name)
        axes[i].set_ylabel("count")

    #Delete unused grid cells; start after the last plotted one so it is kept
    for spare_ax in axes[df_len:]:
        spare_ax.remove()

    plt.show()

def correlation_fig(df:pd.DataFrame):
    """Draw a lower-triangle heat map of the pairwise column correlations.

    The correlation matrix comes from ``df.corr()``. The upper triangle,
    diagonal included, is masked out so every pair appears only once. The
    heat map is drawn on a freshly created 11x9 figure with a diverging
    colour map centred on zero.
    """
    correlations = df.corr()

    # True marks the cells to hide: the diagonal and everything above it.
    hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))

    # Diverging palette so positive and negative correlations get distinct hues.
    palette = sns.diverging_palette(230, 20, as_cmap=True)

    # New figure; the heat map below is drawn on its (current) axes.
    fig, axes = plt.subplots(figsize=(11, 9))

    sns.heatmap(
        correlations,
        mask=hidden_cells,
        cmap=palette,
        vmax=1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
    )
    
#Elbow methods
from sklearn.metrics import silhouette_score
from sklearn import preprocessing as pp

def norm(a:list):
    """Min-max scale a sequence of numbers into the range [0, 1].

    Parameters
    ----------
    a : list
        Numbers (or values convertible with ``float``) to rescale.

    Returns
    -------
    list of float
        ``(x - min) / (max - min)`` for every element, in the input order.
        An empty input gives an empty list. When all values are equal the
        range is zero, so every element maps to ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    values = [float(i) for i in a]
    if not values:
        return []
    # Compute the bounds once instead of once per element.
    lowest = min(values)
    span = max(values) - lowest
    if span == 0:
        return [0.0 for _ in values]
    return [(value - lowest) / span for value in values]

def elbow_method(data:pd.DataFrame, max_range:int, title:str):
    """Plot elbow and silhouette curves and return the k with the best silhouette.

    A KMeans model is fitted for every k from 1 to ``max_range``. Its inertia
    is recorded for each k and its silhouette score for each k above 1. Both
    series are rescaled with ``norm`` and drawn as two lines on the same
    axes; k=1 has no silhouette score and is plotted as 0. The returned value
    is the k whose rescaled silhouette is largest; it is also printed.
    """
    candidate_ks = range(1, max_range + 1)
    inertias = []
    silhouettes = []
    for k in candidate_ks:
        model = KMeans(n_clusters=k, init='random', n_init=10, max_iter=300, random_state=0)
        model.fit(data)
        inertias.append(model.inertia_)
        # A silhouette score needs at least two clusters.
        if k > 1:
            silhouettes.append(silhouette_score(data, model.labels_, metric='euclidean'))

    scores = pd.DataFrame({
        'range_of_k': candidate_ks,
        'mean_square': norm(inertias),
        'silhouette': [0] + norm(silhouettes),
    })

    # Both curves share one set of axes.
    sns.lineplot(data=scores, x="range_of_k", y="silhouette", label='Silhouette', marker='o')
    ax = sns.lineplot(data=scores, x="range_of_k", y="mean_square", label='Elbow aka MeanSquare', marker='o')
    ax.set(xlabel="k", ylabel="Score", title=title)

    best_row = scores['silhouette'].idxmax()
    k_ideal = scores.loc[best_row, 'range_of_k']
    print('ideal number of k_cluster for ',title,'is: [',k_ideal,']')
    return k_ideal

In [None]:
import os
import warnings

# Silence warnings for the rest of the notebook through the supported filter
# API. Replacing warnings.warn with a no-op would also break code that relies
# on warnings being recorded or turned into errors.
warnings.filterwarnings("ignore")

# List every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# numpy and pandas are also imported by the helper cell at the top of the notebook.
import numpy as np
import pandas as pd
# Load the wine data set; the path is relative to the notebook's working directory.
wine_data = pd.read_csv('../input/wine-pca/Wine.csv')
# Preview the first rows.
wine_data.head()

In [None]:
# Summary statistics of the freshly loaded data, before any outlier removal.
wine_data.describe()

### Dealing with null
There are apparently no null values; the check below confirms this.

In [None]:
# Lists only the columns that contain nulls; prints a banner when there are none.
show_percentage_values_missing(wine_data)

### Handling Outliers


In [None]:
# One horizontal box plot holding every column; columns with a large scale dominate the axis.
# grid takes a boolean; the string 'True' only worked because non-empty strings are truthy.
wine_data.plot.box(grid=True, vert=False)

In [None]:
# Box plots before outlier removal: columns are bucketed by the spread of their
# describe() statistics (90 equal-width buckets) and each non-empty bucket gets its own figure.
boxplot_plot(wine_data,bins=90,vert=False)

In [None]:
# Drop rows lying more than 2.5 standard deviations from a column mean.
# NOTE: this mutates wine_data in place, and the limits are recomputed from the
# current rows, so re-running this cell can remove further rows.
outlier_removal_by_std(data= wine_data,std_multi=2.5, inplace=True)

In [None]:
# Same grouped box plots as before, now on the data left after outlier removal.
boxplot_plot(wine_data,bins=90,vert=False)

In [None]:
# Histogram plus fitted normal curve for every non-object column.
get_density_plots(wine_data,suptitle="Density and frequency plot")

In [None]:
# Lower-triangle heat map of the pairwise correlations; used below to spot redundant features.
correlation_fig(wine_data)

In [None]:
#Illustrating Flavanoids and Total_Phenols
sns.regplot(x=wine_data.Flavanoids, y=wine_data.Total_Phenols).set(title='Relationship between Phenols and Flavanoids')
plt.show()

#Dropping One of the features due to high correlation
# NOTE: this mutates wine_data in place; re-running the cell fails because the
# column is already gone.
wine_data.drop(['Total_Phenols'], axis=1, inplace=True)

# Equivalent plot with plain matplotlib, kept for reference only. It reads
# Total_Phenols, so it would have to run before the drop above. It is kept as
# comments because a trailing string literal is echoed as the cell's output.
# x = wine_data.Flavanoids
# y = wine_data.Total_Phenols
# m , c = np.polyfit(x,y,1)
# plt.scatter(x=x,y=y,marker='o')
# plt.title('Matplotlib')
# plt.plot(x, m*x+c)

In [None]:
#Normalize data
from sklearn.preprocessing import StandardScaler 
# NOTE(review): every column of wine_data is scaled and later clustered; if the
# frame still holds a class-label column it will influence the clusters - confirm.
normalized_wine_data = StandardScaler().fit_transform(wine_data)


#Demonstration of normalized data
# The scaled result is indexed as a plain array further down, so the two
# columns are looked up by position here.
alcohol_pos = wine_data.columns.get_loc('Alcohol')
flavanoids_pos = wine_data.columns.get_loc('Flavanoids')

fig, axes = plt.subplots(1,2, figsize=(10,10))
axes = axes.flatten()

axes[0].scatter(x=wine_data.Alcohol, y=wine_data.Flavanoids, color='black')
# Plot the array that is actually fed to KMeans rather than a separate min-max rescaling.
axes[1].scatter(x=normalized_wine_data[:, alcohol_pos], y=normalized_wine_data[:, flavanoids_pos], color='black')

#Set Labels and titles
axes[0].set_title('Original data')
axes[0].set_xlabel('Alcohol')
axes[0].set_ylabel('Flavanoids')
axes[1].set_title('Normalized data')
axes[1].set_xlabel('Alcohol')
axes[1].set_ylabel('Flavanoids')

In [None]:
# Elbow and silhouette curves on the standardized data for k = 1..10; returns the k with the best silhouette.
elbow_method(data=normalized_wine_data, max_range=10, title='Normalized k_means')

In [None]:
# Same analysis on the unscaled data, for comparison with the standardized run.
elbow_method(data=wine_data, max_range=10, title='Original/Non_normalized k_means')

In [None]:
X = normalized_wine_data

# Final model with three clusters, fitted on the standardized data.
km = KMeans(n_clusters=3, init='random', n_init=10, max_iter=400,random_state=0)
label = km.fit_predict(X)
centroids= km.cluster_centers_

# One display name per cluster id, derived from the model rather than hard-coded.
cluster_names = ['cluster'+str(i) for i in range(km.n_clusters)]

# Scatter of the first two standardized columns, coloured by cluster.
for i, name in enumerate(cluster_names):
    ax = sns.scatterplot(x=X[label==i,0], y=X[label==i,1], marker="o", label=name )
ax.set(xlabel=str(wine_data.columns[0])+' (standardized)',
       ylabel=str(wine_data.columns[1])+' (standardized)',
       title='K-means clusters')

df_copy = wine_data.copy()
# Map each integer cluster id straight to its name; no reliance on equal-width binning.
df_copy['label'] = pd.Categorical.from_codes(label, categories=cluster_names, ordered=True)

In [None]:
# Number of columns in the labelled frame (not used by the plots below).
iterate = len(df_copy.columns)
# Pair plots coloured by cluster, split in two so each grid stays readable:
# first the columns at positions 0-5 ...
sns.pairplot(df_copy, hue="label", markers=["o","s","D"], 
             x_vars= df_copy.columns[0:6],
             y_vars= df_copy.columns[0:6],
             corner= True)
# ... then the columns at positions 6-11. Columns from position 12 onwards are not plotted,
# and pairs that mix a column from each group are not shown either.
sns.pairplot(df_copy, hue="label", markers=["o","s","D"],
             x_vars= df_copy.columns[6:12],
             y_vars= df_copy.columns[6:12],
             corner= True)