### Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets
import plotly.express as px

### Models ###
from sklearn.decomposition import PCA
import warnings

### Read data

In [None]:
# Notebook-wide settings: silence every warning and never truncate the column
# list when a DataFrame is displayed.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Feature tables and label tables are semicolon-separated CSV files read from
# the current working directory (train first, then test, for each pair).
X_train, X_test = (
    pd.read_csv(csv_path, sep=";") for csv_path in ("X_train.csv", "X_test.csv")
)
y_train, y_test = (
    pd.read_csv(csv_path, sep=";") for csv_path in ("y_train.csv", "y_test.csv")
)

# Preview the first rows of the training features.
X_train.head()

In [None]:
# Show the column labels of the training feature table.
X_train.columns

In [None]:
# Distinct values present in the GR column of the training labels.
y_train["GR"].unique()

In [None]:
# Number of training rows for each GR value.
y_train["GR"].value_counts()

In [None]:
# X_train = X_train[['RFV', 'H2RFV', 'CONICITY', 'RRO']]

In [None]:
# Min-max scale the training features; the scaler is fitted on X_train only.
scaler = MinMaxScaler()  # alternative: StandardScaler()
# fit_transform returns a bare NumPy array, so restore the original column
# labels and row index. Without them the scaled table gets anonymous 0..n-1
# column names and can no longer be traced back to the source features.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

In [None]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, mutual_info_classif


In [None]:
from functools import partial

# Keep the 7 features with the highest mutual-information score against the
# GR label. The scorer is seeded so repeated runs select the same features
# and agree with the seeded ranking computed elsewhere in this notebook.
selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=7)
# Pass the label as a 1-D Series: handing over the whole y_train DataFrame
# triggers a column-vector conversion warning (hidden by the global warning
# filter) and fails outright if the table has any column besides GR.
X_reduced = selector.fit_transform(X_train, y_train["GR"])
X_reduced.shape


In [None]:
# Positions of the features kept by the selector, mapped back to the
# corresponding column labels of the training features.
cols = selector.get_support(indices=True)
selected_columns = X_train.columns[cols].tolist()
selected_columns


In [None]:
from sklearn.feature_selection import mutual_info_classif

threshold = 10  # how many of the top-scoring features to keep

# Score every feature against the GR label. The label is passed as a 1-D
# Series rather than the whole y_train DataFrame, which would trigger a
# conversion warning (or an error if the table has more than one column).
feature_scores = mutual_info_classif(X_train, y_train["GR"], random_state=0)

# Rank (score, name) pairs from highest to lowest and keep the first
# `threshold` of them, printing each as it is collected.
high_score_features = []
for score, f_name in sorted(zip(feature_scores, X_train.columns), reverse=True)[:threshold]:
    print(f_name, score)
    high_score_features.append(f_name)

# Training features restricted to the top-ranked columns.
df_wine_norm_mic = X_train[high_score_features]
print(df_wine_norm_mic.columns)

## PCA

In [None]:
# Four-component PCA with every solver option written out explicitly and a
# fixed seed.
pca = PCA(
    n_components=4,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    random_state=42,
)

# Fit the components on the scaled training features.
pca.fit(X_scaled)

In [None]:
# Share of the total variance explained by each fitted component.
pca.explained_variance_ratio_

In [None]:
def _ordinal(position):
    """Return the English ordinal label for a positive integer (1 -> '1st')."""
    if 10 <= position % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(position % 10, 'th')
    return str(position) + suffix


# Take bar positions and tick labels from the fitted model rather than
# hard-coding four components: `pca` is rebound to a 2-component model later
# in the notebook, and a fixed np.arange(4) then fails with a shape mismatch
# when this cell is re-run.
explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)
range_x = np.arange(len(explained))
tick_labels = [_ordinal(i + 1) for i in range_x]

# One bar chart for the per-component ratio, one for the running total.
for heights, y_label, chart_title in (
    (explained, 'Explained Variance Ratio', 'Explained Variance'),
    (cumulative, 'Cumulative Explained Variance Ratio', 'Cumulative Explained Variance'),
):
    plt.bar(range_x, heights)
    plt.xticks(range_x, tick_labels)
    plt.xlabel('Principal Component')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.show()

# Same numbers as a table, with components numbered from 1.
d = {'Component': np.arange(1, len(explained) + 1),
     'Explained Variance': explained,
     'Cumulative Explained Variance': cumulative}
df = pd.DataFrame(data=d)

print(df)

In [None]:
# Line plot of the cumulative explained variance as components are added.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

plt.figure(figsize=(10, 8))
# The x positions follow the number of fitted components; a hard-coded
# range(1, 5) fails with a shape mismatch whenever `pca` does not hold
# exactly four components.
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance,
         marker='o', linestyle='--')
plt.title("Explained Variance by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.show()

In [None]:
# Rebind `pca` to a fresh two-component model with a fixed seed; it replaces
# whatever model the name held before and is not fitted yet.
pca = PCA(n_components=2, random_state=42)

In [None]:
# Fit the current PCA model on the scaled training features.
pca.fit(X_scaled)
# Alternative kept for reference: fit on the unscaled features instead.
# pca.fit(X_train)

In [None]:
# Share of the total variance explained by each component of the current model.
pca.explained_variance_ratio_

In [None]:
# Fit the PCA model on the scaled features again and keep their projection
# onto the components (one row per sample, one column per component).
score_pca = pca.fit_transform(X_scaled)
# Alternative kept for reference: project the unscaled features instead.
# score_pca = pca.fit_transform(X_train)

In [None]:
# Name the score columns PC_1, PC_2, ... from the actual width of the score
# matrix, so the labels stay valid if the number of PCA components changes
# (a fixed two-name list raises a shape error for any other width).
column_names = ["PC_" + str(i + 1) for i in range(score_pca.shape[1])]
# Reuse the row index of the scaled features that were projected, so the
# scores stay row-aligned with the table they came from when joined later.
score_pca = pd.DataFrame(score_pca, columns=column_names, index=X_scaled.index)
score_pca

In [None]:
# Side-by-side table: PCA scores, then the original training features, then
# the GR label column (rows are matched on their index).
d = pd.concat(
    [score_pca, X_train, y_train["GR"]],
    axis="columns",
)
d.head()

In [None]:
sns.set(color_codes=True)
# Scatter of the first two principal components, coloured by the GR column of
# `d`; only the classes listed in hue_order are drawn, in that order.
# Marker size is set with `s`, which is forwarded to matplotlib. `sizes` is
# only consulted when a `size` variable is mapped, and `(100)` is a plain
# integer rather than a tuple, so the previous `sizes=(100)` had no effect.
sns.scatterplot(
    data=d,
    x='PC_1',
    y='PC_2',
    hue='GR',
    hue_order=["A", "B", "C"],
    s=100,
)
plt.show()

In [None]:
# Stand-alone 3D scatter demo on seaborn's bundled "penguins" example data.
# NOTE(review): this rebinds `df`, `x`, `y` and `z` in the notebook namespace
# and is unrelated to the PCA data above - confirm it is still wanted.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

df = sns.load_dataset('penguins')
x, y, z = (df[column] for column in ('bill_length_mm', 'bill_depth_mm', 'body_mass_g'))

ax.scatter(x, y, z)
plt.show()

In [None]:
def Scatterplot(data, var1, var2, cat):
    """Show an interactive Plotly scatter plot of two columns, coloured by a third.

    Parameters
    ----------
    data : table passed straight to ``px.scatter``.
    var1 : str, column plotted on the x axis (also used in the title).
    var2 : str, column plotted on the y axis (also used in the title).
    cat : column used to colour the points.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    marker_style = dict(size=12, line=dict(width=1))
    plot_title = 'Scatterplot: ' + var1 + " vs " + var2

    fig = px.scatter(data, x=var1, y=var2, color=cat, width=800)
    # Restyle only the traces drawn in pure marker mode.
    fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
    fig.update_layout(title=plot_title)
    fig.update_xaxes(title=var1)
    fig.update_yaxes(title=var2)
    fig.show()

In [None]:
# Interactive version of the PC_1 vs PC_2 scatter, coloured by the GR column of d.
Scatterplot(d, "PC_1", "PC_2",  "GR")