## EDA

In [None]:
import warnings
import numpy as np 
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split

In [None]:
# Session-wide settings: hide library warnings and never truncate the columns
# shown when a DataFrame is displayed.
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = None

# Each CSV is read whole; the feature columns and the target column `Y` are
# selected from these frames afterwards.
# (A disabled alternative read the targets from separate "y_train.csv" /
# "y_test.csv" files using sep=";".)
X_train_2, X_test_2 = (
    pd.read_csv(csv_path, sep=",") for csv_path in ("train_2.csv", "test_2.csv")
)

X_train_2.head()

In [None]:
# Column selectors shared by the train and test splits.
feature_columns = ['RFV', 'H2RFV', 'CONICITY', 'PLY', 'LFV', 'RRO', 'CAPSPLICE']
target_column = ['Y']  # list selector -> targets stay single-column DataFrames

y_train, y_test = X_train_2[target_column], X_test_2[target_column]
X_train, X_test = X_train_2[feature_columns], X_test_2[feature_columns]

In [None]:
# Summary statistics for every training feature column.
X_train.describe()

In [None]:
# EDA frame: a fresh copy of the training features with the target attached
# as column "Y" (aligned on the row index).
data = X_train.assign(Y=y_train["Y"])
data.head(2)

In [None]:
# Per-class summary statistics of LFV. Selecting the column before calling
# describe() avoids computing statistics for every other column only to
# discard them.
data.groupby("Y")["LFV"].describe()

In [None]:
# Class balance of the target, as percentage shares on a 6x6 inch figure.
fig, ax = plt.subplots(figsize=(6, 6))
class_counts = data['Y'].value_counts()
class_counts.plot.pie(autopct="%1.1f%%", ax=ax)

In [None]:
import seaborn as sns

# Histogram of RFV split by class, written in two equivalent call styles.
# Each call gets its own axes; sharing the current axes would stack the
# second histogram on top of the first and make both unreadable.
fig, (ax_vectors, ax_names) = plt.subplots(1, 2, figsize=(12, 4))

# Style 1: pass the columns themselves as vectors.
sns.histplot(x = data["RFV"], hue = data["Y"], ax = ax_vectors)

# Style 2 (equivalent): pass column names plus the frame, here with the "Set1" palette.
sns.histplot(x = "RFV", hue = "Y", data = data, palette = "Set1", ax = ax_names);

In [None]:
# Class-conditional density of RFV: one filled KDE per value of Y on shared axes.
feature = "RFV"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of H2RFV: one filled KDE per value of Y on shared axes.
feature = "H2RFV"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of CONICITY: one filled KDE per value of Y on shared axes.
feature = "CONICITY"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of PLY: one filled KDE per value of Y on shared axes.
feature = "PLY"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of LFV: one filled KDE per value of Y on shared axes.
feature = "LFV"
fig, ax = plt.subplots()
# The legend labels are positional, so the classes must be drawn as
# Y == 0 (blue, "A") first and Y == 1 (green, "BC") second -- the same
# convention as the other density plots. Drawing Y == 1 first would attach
# the "A" label and the blue colour to the wrong class.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of RRO: one filled KDE per value of Y on shared axes.
feature = "RRO"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Class-conditional density of CAPSPLICE: one filled KDE per value of Y on shared axes.
feature = "CAPSPLICE"
fig, ax = plt.subplots()
# Plot order (Y == 0, then Y == 1) must match the legend labels passed below.
for target_value, colour in ((0, "Blue"), (1, "Green")):
    values = data.loc[data["Y"] == target_value, feature].dropna()
    # `fill=True` is the supported spelling of the deprecated `shade=True` alias.
    sns.kdeplot(values, color=colour, fill=True, ax=ax)
ax.set_xlabel(feature)
ax.set_ylabel("Density")
g = ax.legend(["A", "BC"])

In [None]:
# Box plot of every feature against the class label, one figure per feature.
# `sns.catplot` is the supported name of the deprecated `sns.factorplot`
# (same arguments), so no warning suppression is needed for these calls.
orders = [0, 1]  # fixed left-to-right order of the classes on the x axis
boxplot_features = ["RFV", "H2RFV", "CONICITY", "PLY", "LFV", "RRO", "CAPSPLICE"]
for feature in boxplot_features:
    g = sns.catplot(y=feature, x="Y", data=data, kind="box", order=orders)

### Correlations

In [None]:
# Display the training feature matrix that the correlation analysis is run on.
X_train

In [None]:
# Lower-triangle Pearson correlation heatmap of the training features.
data_cor = X_train.copy()

# Compute the correlation matrix once and reuse it for the plot.
corrmat = data_cor.corr(method='pearson')

# Positional boolean mask: True hides the cell. Covers the diagonal and the
# upper triangle regardless of the coefficient values (a mask derived from
# the values themselves would leave an exactly-zero coefficient visible).
upper_mask = np.triu(np.ones_like(corrmat, dtype=bool))

g = sns.heatmap(corrmat, cmap="coolwarm", annot=True, mask=upper_mask, fmt=".2f")

In [None]:
# Full Pearson correlation matrix of the features, shown as a table.
data_cor.corr(method='pearson')

In [None]:
# Unmasked correlation heatmap over the feature columns of `data`
# (the target column "Y" is excluded because only `data_cor`'s columns are used).
top_num_features = list(data_cor.columns)
corrmat = data.loc[:, top_num_features].corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corrmat, ax=ax);

### Mutual information

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_classif

In [None]:
# Keep the k features with the highest mutual information with the target.
# mutual_info_classif is a randomised estimator, so it is wrapped with a fixed
# random_state to make the scores (and hence the selection) reproducible.
selector = SelectKBest(
    lambda X, y: mutual_info_classif(X, y, random_state=0),
    k=7,
)

# scikit-learn expects a 1-D target; y_train is a single-column frame, so flatten it.
X_reduced = selector.fit_transform(X_train, np.ravel(y_train))

# Map the selected positions back to column names.
cols = selector.get_support(indices=True)
selected_columns = X_train.columns[cols].tolist()
selected_columns

In [None]:
threshold = 10  # upper bound on how many top-ranked features to keep

# Mutual information of each feature with the target (seeded for reproducibility).
# The target is flattened to 1-D, which is the shape scikit-learn expects.
feature_scores = mutual_info_classif(X_train, np.ravel(y_train), random_state=0)

# Highest score first; the slice simply returns everything when there are
# fewer than `threshold` features.
ranked = sorted(zip(feature_scores, X_train.columns), reverse=True)[:threshold]

high_score_features = []
for score, f_name in ranked:
    print(f_name, score)
    high_score_features.append(f_name)

# NOTE(review): the name `df_wine_norm_mic` looks carried over from another
# notebook; it is kept unchanged in case other cells reference it.
df_wine_norm_mic = X_train[high_score_features]
print(df_wine_norm_mic.columns)

In [None]:
def Scatterplot(data, var1, var2, cat):
    """Show an interactive Plotly scatter of ``var2`` against ``var1``.

    Parameters
    ----------
    data : frame passed straight to ``px.scatter``.
    var1 : str
        Column plotted on the x axis; also used as the x-axis title.
    var2 : str
        Column plotted on the y axis; also used as the y-axis title.
    cat : column used to colour the points.

    Returns
    -------
    None. The figure is rendered via ``fig.show()``.
    """
    heading = 'Scatterplot: ' + var1 + " vs " + var2
    marker_style = dict(size = 12, line = dict(width = 1))

    # Plotly's update_* methods return the figure, so the styling steps chain.
    (
        px.scatter(data, x = var1, y = var2, color = cat, width = 800)
        .update_traces(marker = marker_style, selector = dict(mode = 'markers'))
        .update_layout(title = heading)
        .update_xaxes(title = var1)
        .update_yaxes(title = var2)
        .show()
    )

In [None]:
# Interactive scatter of RRO against RFV, coloured by the class label Y.
Scatterplot(data, "RFV", "RRO",  "Y")

In [None]:
# Pairwise scatter matrix coloured by class, with KDE curves on the diagonal.
g = sns.pairplot(
    data,
    hue="Y",
    palette="Set2",
    diag_kind="kde",
    height=2.5,
)

In [None]:
# Pairwise scatter matrix coloured by class, with histograms on the diagonal.
g = sns.pairplot(
    data,
    hue="Y",
    palette="Set2",
    diag_kind="hist",
)

In [None]:
# Interactive 3-D scatter of H2RFV, CONICITY and RRO, coloured by the class label.
axes_3d = dict(x="H2RFV", y="CONICITY", z="RRO")
fig = px.scatter_3d(data, color="Y", **axes_3d)
fig.show()

In [None]:
# Interactive 3-D scatter of H2RFV, CONICITY and RFV, coloured by the class label.
axes_3d = dict(x="H2RFV", y="CONICITY", z="RFV")
fig = px.scatter_3d(data, color="Y", **axes_3d)
fig.show()