# 1- Environment Setup


## 1.1 - Libraries

In [1]:
# Library imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import graphviz
from plotly.subplots import make_subplots
from collections import Counter
from datetime import datetime
from math import ceil
from scipy.cluster.hierarchy import dendrogram, linkage
import sqlite3
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN, estimate_bandwidth
from sklearn.metrics import silhouette_score, silhouette_samples, pairwise_distances, pairwise_distances_argmin_min, mean_squared_error, f1_score, accuracy_score, precision_score, recall_score, classification_report, r2_score
from sklearn.neighbors import NearestNeighbors
from sklearn.base import clone
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import RFE
import sklearn as sk


## 1.2 - Import Dataset


In [None]:

# Paths to the training and test CSV files.
# NOTE(review): both assignments below have no right-hand side in this copy of the
# notebook, so the cell cannot run as-is — fill in the file paths before executing.
data_path_train = 
data_path_test = 

# Load the training set (df) and the test/submission set (dft).
df = pd.read_csv(data_path_train)
dft = pd.read_csv(data_path_test)



In [None]:
df.head()

In [None]:
## Copy the original dataset (so that we can make changes and the original remains intact)
df_original = df.copy()
df.shape

## 1.3 - Functions

In [None]:
## Measure the skewness of a dataframe

def calculate_skewness(df):
    """Return the skewness of every column of ``df``.

    Args:
        df (DataFrame): Frame whose columns are measured one at a time
            with ``Series.skew()``.

    Returns:
        dict: Column name -> value returned by ``.skew()`` for that column.
        The dict is empty when ``df`` has no columns.
    """
    return {name: df[name].skew() for name in df.columns}

## Measure the kurtosis of a dataframe

def calculate_kurtosis(df):
    """Return the kurtosis of every column of ``df``.

    NOTE(review): pandas documents ``Series.kurtosis()`` as Fisher (excess)
    kurtosis, where a normal distribution scores 0 — confirm before comparing
    the returned values against 3.

    Args:
        df (DataFrame): Frame whose columns are measured one at a time
            with ``Series.kurtosis()``.

    Returns:
        dict: Column name -> value returned by ``.kurtosis()`` for that column.
        The dict is empty when ``df`` has no columns.
    """
    return {name: df[name].kurtosis() for name in df.columns}


## 1.4 - Data Exploration

In [None]:
# Display the first few rows of the DataFrame 'df' to get a quick look at the data
df.head()

In [None]:
# Provides a concise summary of columns, datatypes, and the presence of null values
df.info()

In [None]:
# df.describe() produces a statistical summary of all numerical columns. It is how we detected that both
# 'last_year_avg_monthly_charity_donations' and 'avg_weekly_exercise_hours' contain negative values.

df.describe().T

In [None]:
# Provides statistics for categorical variables (with datatype object)
df.describe(include = ['O'])

# 2 - Train Set Preparation

### 2.0 - Determine initial metric and non_metric variables

In [None]:
# We classify each variable as either metric or non-metric and assign it to the respective list.

# Numeric features used by the statistics, scaling and clustering steps.
# 'age' and 'female' are derived columns created in section 2.2.
metric = [
    'last_year_avg_monthly_charity_donations',
    'environmental_awareness_rating',
    'financial_wellness_index',
    'investment_portfolio_value',
    'investments_risk_appetite',
    'investments_risk_tolerance',
    'tech_savviness_score',
    'social_media_influence_score',
    'entertainment_engagement_factor',
    'avg_monthly_entertainment_expenses',
    'avg_weekly_exercise_hours',
    'health_consciousness_rating',
    'stress_management_score',
    'overall_well_being',
    'age',
    'female',
]

# Raw columns that are not numeric measures.
non_metric = [
    'title',
    'date_of_birth',
]


## 2.1 - Duplicated Rows

In [None]:
df_prep = df.copy()
df_prep.set_index('citizen_id', inplace=True)

In [None]:
# We dropped the duplicate rows
df_prep.drop_duplicates(inplace=True)
df_prep

In [None]:
# Re-add the 'citizen_id' column, computed as the current index value plus one
# (the stated goal is ids that start at 1 instead of 0).
# NOTE(review): 'citizen_id' was already made the index earlier in this section, so
# `df_prep.index + 1` shifts the existing ids by one rather than numbering rows by
# position; gaps left by the dropped duplicates remain — confirm this is intended.
df_prep['citizen_id'] = df_prep.index + 1

In [None]:
# We replace the index with the newly calculated citizen_id (at this stage the data frame has no duplicates, so each row's citizen_id is unique)
df_prep.set_index('citizen_id', inplace=True)
df_prep

## 2.2 - Modify variables (recodes)


In [None]:
# Encode 'title' as a binary integer column named 'female': 0 when the title is
# exactly 'Mr.', 1 for any other value, so it can be used as a numeric feature.
df_prep['female'] = df_prep['title'].map(lambda title: int(title != 'Mr.'))

# 'title' is now replaced by 'female', so remove it from the frame.
del df_prep['title']

# Reference year used to derive age.
basis_year = 2024

# Convert the original date_of_birth variable to a date type, then compute
# age as the reference year minus the birth year.
df_prep['date_of_birth'] = pd.to_datetime(df_prep['date_of_birth'])
df_prep['age'] = basis_year - df_prep['date_of_birth'].dt.year

# Define the variable for the calculation
age = df_prep['age']

# Negative donation / exercise values were judged to be data-entry errors. They are a
# small share of the rows, so they are converted to their absolute value.
for _col in ('last_year_avg_monthly_charity_donations', 'avg_weekly_exercise_hours'):
    df_prep[_col] = df_prep[_col].abs()

## 2.3 - Missing Values

In [None]:
#df_prep[metric].isna().sum()

## 2.4 - Mean and Standard Deviation

In [None]:
#st = df_prep[metric].describe().T
#st

## 2.5 - Histograms and Boxplots



In [None]:
def plot_histograms_boxplots(df, feats):
    """Plot a histogram and a boxplot, side by side, for each selected column.

    The figure has one row per feature: the histogram in the left column and
    the boxplot in the right column.

    Args:
        df (DataFrame): Data containing the columns to plot.
        feats (str or list): Column name, or list of column names, to plot.

    Raises:
        ValueError: If ``feats`` selects no columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(feats, str):
        feats = [feats]

    # Select the columns once instead of re-slicing the frame for every plot.
    subset = df[feats]
    columns = list(subset.columns)
    if not columns:
        raise ValueError("feats must select at least one column to plot")

    # One row per feature, two plots per row. squeeze=False keeps `axes`
    # two-dimensional even when only one feature is plotted.
    _, axes = plt.subplots(nrows=len(columns), ncols=2,
                           figsize=(10, len(columns) * 4), squeeze=False)

    for (hist_ax, box_ax), column in zip(axes, columns):
        # Histogram
        subset[column].plot(kind='hist', ax=hist_ax, title=f'Histogram of {column}')
        hist_ax.set_xlabel(column)

        # Boxplot
        subset[column].plot(kind='box', ax=box_ax, title=f'Boxplot of {column}')

    # Adjust layout to prevent overlap
    plt.tight_layout()
    plt.show()

In [None]:
#plot_histograms_boxplots(df_prep,metric)

## 2.6 - Skewness and Kurtosis

SKEWNESS & KURTOSIS

In [None]:
kurtosis_prep = calculate_kurtosis(df_prep[metric])
skewness_prep = calculate_skewness(df_prep[metric])

In [None]:
#kurtosis_prep

In [None]:
#skewness_prep


## 2.7 - Power Transformation (Yeo-Johnson Transform - Skewness and Kurtosis)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Yeo-Johnson power transformer, fitted on the training metric features.
pt = PowerTransformer(method='yeo-johnson')

# Fit and transform in one step. The resulting frame keeps the metric column
# names but gets a default integer index (no index is passed here).
_train_metric = df_prep[metric]
df_transformed = pd.DataFrame(pt.fit_transform(_train_metric), columns=_train_metric.columns)

df_transformed

In [None]:
kurtosis_yj = calculate_kurtosis(df_transformed)
skewness_yj = calculate_skewness(df_transformed)

In [None]:
#kurtosis_yj



In [None]:
#skewness_yj


In [None]:
#plot_histograms_boxplots(df_transformed,metric)

## 2.8 - Normalization


In [None]:
# Standardise the power-transformed features and label the rows with the
# citizen_id index of df_prep.
scaler = StandardScaler()
df_norm = pd.DataFrame(
    scaler.fit_transform(df_transformed),
    index=df_prep.index,
    columns=df_prep[metric].columns,
)
df_norm

### 2.8.1 - KNN Imputer

In [None]:
# df_im holds only the metric features; it contains no categorical variables.
# This cell takes a long time to run -> that is expected.
imputer = KNNImputer(n_neighbors=4)
df_im = pd.DataFrame(
    imputer.fit_transform(df_norm[metric]),
    index=df_prep[metric].index,
    columns=df_prep[metric].columns,
)
# Count the missing values left in each column after imputation.
df_im.isna().sum()

In [None]:
kurtosis_norm = calculate_kurtosis(df_norm)
skewness_norm = calculate_skewness(df_norm)

In [None]:
kurtosis_norm
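# Executive Summary: Kurtosis Interpretation
# NOTE(review): these values come from pandas' Series.kurtosis(), which pandas documents as
# Fisher (excess) kurtosis, i.e. a normal distribution scores 0 rather than 3 — confirm that
# the ">3 / ≈3 / <3" cut-offs below were applied on the intended scale.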

# Leptokurtic Variables (Kurtosis > 3):
# - last_year_avg_monthly_charity_donations, financial_wellness_index,
#   overall_well_being, entertainment_engagement_factor:
#   Distributions have heavier tails and sharper peaks than a normal distribution.

# Mesokurtic Variables (Kurtosis ≈ 3):
# - environmental_awareness_rating, investment_portfolio_value,
#   health_consciousness_rating, stress_management_score, age:
#   Distributions are approximately similar to a normal distribution in terms of tail heaviness.

# Platykurtic Variables (Kurtosis < 3):
# - investments_risk_appetite, social_media_influence_score,
#   avg_monthly_entertainment_expenses, avg_weekly_exercise_hours, female:
#   Distributions have lighter tails and a flatter peak than a normal distribution.


In [None]:
skewness_norm
# Executive Summary: Skewness Interpretation

# Highly Positively Skewed Variables (Skewness > 1):
# - last_year_avg_monthly_charity_donations, avg_weekly_exercise_hours:
#   Distributions are highly skewed to the right.

# Moderately Positively Skewed Variables (0.5 < Skewness < 1):
# - overall_well_being: Distribution is moderately skewed to the right.

# Mildly Positively Skewed Variables (0 < Skewness < 0.5):
# - environmental_awareness_rating, financial_wellness_index,
#   investments_risk_tolerance, tech_savviness_score,
#   entertainment_engagement_factor, avg_monthly_entertainment_expenses,
#   health_consciousness_rating, stress_management_score, age:
#   Distributions are mildly skewed to the right.

# Close to Symmetric Variables (Skewness ≈ 0):
# - investment_portfolio_value, social_media_influence_score, female:
#   Distributions are approximately symmetric.


In [None]:
# Kolmogorov-Smirnov normality test for each standardised attribute.
from scipy.stats import kstest

alpha = 0.05  # significance level used to interpret every p-value
for column in df_norm.columns:
    # df_norm is the frame that is later fed to the KNN imputer, so it can still
    # hold missing values; drop them so they do not invalidate the test result.
    data = df_norm[column].dropna()
    statistic, p_value = kstest(data, 'norm')  # Perform Kolmogorov-Smirnov test against a normal distribution
    print(f"Variable: {column}")
    print(f"Kolmogorov-Smirnov Test - Statistic: {statistic}, p-value: {p_value}")

    # Interpret the test result
    if p_value > alpha:
        print("Sample looks Gaussian (fail to reject H0)")
    else:
        print("Sample does not look Gaussian (reject H0)")
    print("\n")


In [None]:
# Anderson-Darling normality test for each standardised attribute.
from scipy.stats import anderson

alpha = 0.05  # significance level used to choose the critical value
for column in df_norm.columns:
    # df_norm is the frame that is later fed to the KNN imputer, so it can still
    # hold missing values; drop them so they do not invalidate the test result.
    data = df_norm[column].dropna()
    result = anderson(data)  # Perform Anderson-Darling test
    statistic = result.statistic
    critical_values = result.critical_values
    significance_level = result.significance_level

    print(f"Variable: {column}")
    print(f"Anderson-Darling Test - Statistic: {statistic}")
    print(f"Critical Values: {critical_values}")
    print(f"Significance Level: {significance_level}")

    # Interpret the test result: compare against the critical value whose
    # significance level (reported in percent) is closest to alpha, instead of
    # relying on a hard-coded position in the array.
    level_idx = int(np.argmin(np.abs(np.asarray(significance_level) - 100 * alpha)))
    if statistic < critical_values[level_idx]:
        print("Sample looks Gaussian (fail to reject H0)")
    else:
        print("Sample does not look Gaussian (reject H0)")
    print("\n")


## 2.9 - DBScan - Outliers






In [None]:
def plot_kdist_graph(df, feats, n_neighbors):
    """
    Plot the sorted k-distance curve used to choose epsilon (eps) for DBSCAN.

    For every row, the distance to the last of its ``n_neighbors`` nearest
    neighbours is taken (not an average). These distances are sorted in
    ascending order and plotted to help identify an appropriate eps value.

    NOTE(review): the neighbours are queried with the same rows the model was
    fitted on, so each row presumably counts itself as its first neighbour —
    confirm whether ``n_neighbors`` should be raised by one to compensate.

    Args:
        df (DataFrame): The data frame containing the data points.
        feats (list): Feature columns used for the distance calculation.
        n_neighbors (int): Number of nearest neighbours retrieved per row.

    Returns:
        None
    """
    features = df[feats]
    knn = NearestNeighbors(n_neighbors=n_neighbors).fit(features)
    neighbour_dist, _ = knn.kneighbors(features)

    # The last column holds the distance to the farthest retrieved neighbour.
    kth_dist = np.sort(neighbour_dist[:, -1])

    plt.plot(kth_dist)
    plt.xlabel("Points sorted by distance")
    plt.ylabel("%d-NN Distance" % n_neighbors)
    plt.title("DBScan - Kdist")
    plt.show()

In [None]:
## This one also takes a long time to run
#plot_kdist_graph(df_im, metric,n_neighbors = 10)

In [None]:
## Create a DBSCAN clustering model with specified hyperparameters
dbscan = DBSCAN(eps=2.5, min_samples=10, n_jobs=4)

## Fit the DBSCAN model to the data and predict cluster labels
dbscan_labels = dbscan.fit_predict(df_im[metric])

## Print information about the clustering results
label_counts = Counter(dbscan_labels)
print("Points in cluster -1 are noise rows.")
print("Counter of cluster labels     :", label_counts)
# The share of noise is relative to the rows that were actually clustered, not
# to the raw `df`, which still contains the duplicates removed in section 2.1.
print("Percentage of noise rows      :", round(100*label_counts[-1]/len(dbscan_labels),2))

In [None]:
def split_noise_rows(df, feats, dbs_model):
    """Cluster ``df[feats]`` with ``dbs_model`` and split the rows into noise and non-noise.

    Args:
        df (DataFrame): Data to cluster. It is not modified.
        feats (list): Columns passed to the clustering model.
        dbs_model: Estimator exposing ``fit_predict`` (a DBSCAN instance in this
            notebook). It is fitted on ``df[feats]`` by this call.

    Returns:
        tuple: ``(df_noise, df_nonoise, df_concat)``. ``df_concat`` is ``df`` plus a
        ``'dbscan_labels'`` column; ``df_noise`` holds the rows labelled -1;
        ``df_nonoise`` holds every other row, whichever cluster it belongs to.
    """
    ## Predict cluster labels using the provided DBSCAN model
    dbscan_labels = dbs_model.fit_predict(df[feats])

    ## Concatenate cluster labels with the original DataFrame
    labels = pd.Series(dbscan_labels, name='dbscan_labels', index=df.index)
    df_concat = pd.concat([df, labels], axis=1)

    ## Create separate DataFrames for noise and non-noise rows.
    ## Non-noise is "anything that is not -1": keeping only label 0 would silently
    ## drop rows whenever the model finds more than one cluster.
    is_noise = df_concat['dbscan_labels'] == -1
    df_noise = df_concat[is_noise].copy()
    df_nonoise = df_concat[~is_noise].copy()

    return df_noise, df_nonoise, df_concat

In [None]:
df_im_with_target = df_im.copy()
#Reintegrate target to implement RFE
df_im_with_target['lifestyle_type'] = df_prep['lifestyle_type']

In [None]:
df_noise, df_nonoise, df_concat = split_noise_rows(df_im_with_target, metric, dbscan)

# 3 - Test Set Preparation

### 3.1 - No Duplicated Rows

## 3.2 - Replace Index

In [None]:
dft_prep = dft.copy()
dft_prep.set_index('citizen_id', inplace=True)
dft_prep.shape

## 3.3 - Modify variables

In [None]:
# Title to female: 0 when the title is exactly 'Mr.', 1 for any other value;
# the original 'title' column is then removed.
dft_prep['female'] = dft_prep['title'].map(lambda title: int(title != 'Mr.'))
del dft_prep['title']

In [None]:
# Reference year for the age calculation (fixed at 2024).
basis_year = 2024

# Convert the original date_of_birth variable to a date type, then derive age
# as the reference year minus the birth year.
_birth_dates = pd.to_datetime(dft_prep['date_of_birth'])
dft_prep['date_of_birth'] = _birth_dates
dft_prep['age'] = basis_year - _birth_dates.dt.year

# Define the variable for the calculation
age = dft_prep['age']

In [None]:
## Negative donation / exercise values were judged to be data-entry errors. They are a
## small share of the rows, so they are converted to their absolute value.
for _col in ('last_year_avg_monthly_charity_donations', 'avg_weekly_exercise_hours'):
    dft_prep[_col] = dft_prep[_col].abs()

## 3.4 - Missing Values

In [None]:
dft_prep.isna().sum()

## 3.5 - Histograms and Boxplots

In [None]:
#plot_histograms_boxplots(dft_prep,metric)

## 3.6 - Skewness and Kurtosis

In [None]:
kr_test = calculate_kurtosis(dft_prep[metric])
sk_test = calculate_skewness(dft_prep[metric])

In [None]:
#kr_test

In [None]:
#sk_test

## 3.7 - Power Transformation

In [None]:
# Apply the Yeo-Johnson transformer that was fitted on the training set (`pt`,
# section 2.7) instead of fitting a new one on the test data: a re-fit would
# estimate different parameters, so the test features would not be transformed
# the same way as the features the model is trained on.
dft_transformed = pd.DataFrame(pt.transform(dft_prep[metric]),columns=dft_prep[metric].columns)

dft_transformed

## 3.8 - Normalization

In [None]:
# Standardise with the scaler fitted on the training set (`scaler`, section 2.8)
# rather than re-fitting on the test data, so train and test features share the
# same centring and scaling.
dft_norm = pd.DataFrame(scaler.transform(dft_transformed), columns=dft_prep[metric].columns)
dft_norm.index = dft_prep.index
dft_norm

## 3.9 - KNN Imputer


In [None]:
# dft_im holds only the metric features; it contains no categorical variables.
# This cell takes a long time to run -> that is expected.
# Missing values are filled with the imputer fitted on the training set
# (`imputer`, section 2.8.1) rather than with one re-fitted on the test data,
# so the test set is imputed from the same reference rows as the train set.
dft_im = pd.DataFrame(imputer.transform(dft_norm[metric]))
## Set correct column names and index
dft_im.columns = dft_prep[metric].columns
dft_im.index = dft_prep[metric].index

# Validate the imputer
#dft_im.isna().sum()

# 4 - Feature Selection

### 4.1 - Spearman Correlation

In [None]:

def plot_corr_matrix(df, metric_variables, title="Correlation Matrix", method="spearman", figsize=(20, 16)):
    """Draw a heatmap of the correlation matrix of the selected variables.

    Only cells with an absolute correlation of at least 0.5 are annotated with
    their value; the other cells are left blank.

    Args:
        df (DataFrame): Data containing the variables.
        metric_variables (list): Columns to correlate.
        title (str): Figure title.
        method (str): Correlation method passed to ``DataFrame.corr``.
        figsize (tuple): Figure size passed to ``plt.figure``. The default
            (20, 16) is the size this function has always drawn; the argument
            was previously overwritten with that value and therefore ignored.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Prepare figure
    fig = plt.figure(figsize=figsize)

    # Correlation matrix of the selected variables, rounded to 2 decimal places
    corr = np.round(df[metric_variables].corr(method=method), decimals=2)

    # Annotation matrix: keep the value where |corr| >= 0.5, otherwise use an
    # empty string. The annotations are passed with fmt='s', i.e. as text.
    mask_annot = np.absolute(corr.values) >= 0.5
    annot = np.where(mask_annot, corr.values, np.full(corr.shape, ""))

    # Plot the heatmap representing the correlation matrix
    sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(220, 10, as_cmap=True),
                fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)

    # Leave room for the title above the heatmap
    fig.subplots_adjust(top=0.95)
    fig.suptitle(title, fontsize=20)

    # Display the plot
    plt.show()

In [None]:
#plot_corr_matrix(df_nonoise, metric)

Based on Spearman correlation, we have excluded variables entertainment_engagement_factor and overall_well_being.
We have made this decision because entertainment engagement is highly correlated with two variables (stress management and avg_monthly entertainment) and overall_well_being is more difficult to explain/measure in a realistic framework.

In [None]:
# Metric features kept after the Spearman correlation review; the two excluded
# variables are left in place as comments.
metric_new = [
    'last_year_avg_monthly_charity_donations',
    'environmental_awareness_rating',
    'financial_wellness_index',
    'investment_portfolio_value',
    'investments_risk_appetite',
    'investments_risk_tolerance',
    'tech_savviness_score',
    'social_media_influence_score',
    # 'entertainment_engagement_factor',
    'avg_monthly_entertainment_expenses',
    'avg_weekly_exercise_hours',
    'health_consciousness_rating',
    'stress_management_score',
    # 'overall_well_being',
    'age',
    'female',
]

# Number of features that remain.
len(metric_new)

### 4.2 - RFE

In [None]:
# Define train data
X_train = df_nonoise[metric_new]
Y_train = df_nonoise['lifestyle_type']

In [None]:

#model = RandomForestClassifier(n_jobs=14)


#param_grid = {
#    'n_features_to_select': np.arange(1, 14)
#}


#rfe = RFE(estimator=model)

# Define GridSearchCV
#grid_search = GridSearchCV(estimator=rfe, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=14)

# Fit the GridSearchCV
#grid_search.fit(X_train, Y_train)

# Get the best parameters and best score
#best_params = grid_search.best_params_
#best_score = grid_search.best_score_

#print("Best number of features to select: ", best_params['n_features_to_select'])
#print("Best cross-validation score: %f" % best_score)


In [None]:
## PUT HERE THE PARAMETERS NUMBER

In [None]:
# Use RFE to select the optimal number of features (11)
#rfe = RFE(estimator=model, n_features_to_select=11)

# Fit and transform the training data
#rfe_fitted = rfe.fit_transform(X_train, Y_train)

# Get the support mask of the selected features
#selected_features_mask = rfe.support_

# Create a series with the selected feature names
#selected_features = pd.Series(selected_features_mask, index=X_train.columns)
#selected_feature_names = selected_features[selected_features].index.tolist()

# Print the selected feature names
#print("Selected feature names: ", selected_feature_names)

In [None]:
# Features hard-coded as the outcome of the (commented-out) RFE step above.
selected_feature_names = [
    'environmental_awareness_rating',
    'financial_wellness_index',
    'investment_portfolio_value',
    'investments_risk_appetite',
    'investments_risk_tolerance',
    'tech_savviness_score',
    'social_media_influence_score',
    'avg_monthly_entertainment_expenses',
    'avg_weekly_exercise_hours',
    'health_consciousness_rating',
    'stress_management_score',
]

#### 4.3 - Final Features

In [None]:
final_features = selected_feature_names
final_features

# 5 - Final Datasets for Model Training

In [None]:
# Final Train Set
X_train = df_nonoise[final_features]

#Final test/submission set.
X_test = dft_im[final_features]

#Train and validation sets. Validation set is a subset of train set. -> for troubleshooting purposes
R_train, R_val, RY_train, RY_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

# 6 - Models

### 6.0 - Evaluation Functions

In [None]:
def model_eval(model, train, test, target_train, target_test):
    """Fit ``model`` on the training data and report the weighted F1 score on both splits.

    Note that ``model.fit`` is called here, so the estimator passed in is
    (re)fitted on ``train`` / ``target_train``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        train: Training features.
        test: Evaluation features.
        target_train: Labels for ``train``.
        target_test: Labels for ``test``.

    Returns:
        dict: ``{'F1 Score': [train_f1, test_f1]}``.
    """
    fitted = model.fit(train, target_train)

    # Predict both splits first, then score them.
    train_pred = fitted.predict(train)
    test_pred = fitted.predict(test)

    scores = [
        f1_score(target_train, train_pred, average='weighted'),
        f1_score(target_test, test_pred, average='weighted'),
    ]
    return {'F1 Score': scores}

### 6.1 - One-Vs-Rest Logistic Regression

One-vs-Rest Approach: Since we have multiple categories (more than two). This means building separate models for each lifestyle type, comparing it to all others.

In [None]:
#param_grid = {
#    'penalty': ['l1', 'l2'],
#    'C': [0.01, 0.1, 1, 10, 100],
#   'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
#}


#algrthm = LogisticRegression(multi_class='ovr',random_state= 1 )


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#LogReg = sorted_results_df[['params','mean_test_score','mean_fit_time']]

In [None]:
#LogReg.head()

### 6.2 - Multinomial Logistic Regression

In [None]:
# Initialize the Logistic Regression model with multinomial logistic regression
#param_grid = {
#    'penalty': ['l1', 'l2', 'elasticnet'],
#    'C': [0.01, 0.1, 1, 10, 100],
#   'l1_ratio': [0, 0.25, 0.5, 0.75, 1]  # Only used if penalty='elasticnet'
#}


#algrthm = LogisticRegression(multi_class='multinomial', random_state= 1 )


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#MultinomialReg = sorted_results_df[['params','mean_test_score','mean_fit_time']]

In [None]:

#MultinomialReg.head()

### 6.3 - Decision Tree

In [None]:
# Example usage

#param_grid = {
#    'criterion': ['gini', 'entropy'],
#    'max_depth': [50,100,200,300],
#    'min_samples_split': [20,30,40],
#    'min_samples_leaf': [20,30,40],
#    'max_features': ['sqrt', 'log2'],
#    'max_leaf_nodes': [50, 100, 200],
#}

#algrthm = DecisionTreeClassifier(random_state=1)


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#DecisionTree = sorted_results_df[['params','mean_test_score','mean_fit_time']]


In [None]:
#DecisionTree.head()

### 6.4 - Random Forest

In [None]:
#param_grid = {
    #'max_depth': [200, 500],
    #'n_estimators': [500, 1000],
    #'max_features': ['sqrt', 'log2']
#}
#algrthm = RandomForestClassifier(n_jobs = 18, random_state= 1)


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#RndFrst = sorted_results_df[['params','mean_test_score','mean_fit_time']]

In [None]:
#RndFrst.head()

In [None]:
#rf = RandomForestClassifier(n_jobs=14, max_depth = 50, n_estimators= 800, random_state= 1, max_features= 'sqrt')
#rf.fit(R_train,RY_train)
#model_eval(rf,R_train,R_val,RY_train,RY_val)

### 6.5 - Neural Network

In [None]:


#param_grid = {
#    'hidden_layer_sizes': [(200,100,50,30),(200,100), (200,100,50)],
#    'activation': ['relu'],
#    'solver': ['adam'],
#    'alpha': [0.1],
#    'learning_rate': ['adaptive'],
#    'learning_rate_init': [0.001],
#   'max_iter': [100,200,300],
#    'n_iter_no_change':[10]
#}

#algrthm = MLPClassifier(random_state= 1)


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#MLPC = sorted_results_df[['params','mean_test_score','mean_fit_time']]





## 6.6 - Ada Boost

In [None]:
#param_grid = {
#    'n_estimators': [100, 200 , 500, 700],
#    'learning_rate': [0.001 ,0.01, 0.1, 1.0],
#    'algorithm': ['SAMME', 'SAMME.R']
#}

#algrthm = AdaBoostClassifier(random_state=1)


#clf = GridSearchCV(algrthm, param_grid, cv=5, scoring='f1_micro')
#clf.fit(X_train, Y_train)

#results = clf.cv_results_

#params = results['params']
#mean_test_scores = results['mean_test_score']


 #Convert the results to a DataFrame
#results_df = pd.DataFrame(results)

# Sort the DataFrame by mean test score in descending order
#sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=False)

# Display the best parameters and corresponding score
#best_result = sorted_results_df.iloc[0]
#ADBC = sorted_results_df[['params', 'mean_test_score', 'mean_fit_time']]

#ADBC



In [None]:
#pd.set_option('display.max_rows', None)  # Show all rows
#pd.set_option('display.max_columns', None)  # Show all columns
#pd.set_option('display.width', None)  # Set the display width to None
#pd.set_option('display.max_colwidth', None)  # Set the maximum column width to None
#ADBC.head

# 7 - Final Models

In [None]:
#Submission 09
#Nn3 =  MLPClassifier(hidden_layer_sizes= (200,100,50,25,15,10,5),
#                   learning_rate='constant',alpha=0.1, random_state = 1, activation= 'relu',
#                     max_iter=300, solver = 'adam', verbose=True, tol = 0.0005
#                     )
#Nn3.fit(X_train,Y_train)

In [None]:
#Submission 10 - BEST MODEL UNTIL 09/06/2024 00:35
Nn2= MLPClassifier(hidden_layer_sizes= (200,100,50), learning_rate='adaptive',alpha=0.1, random_state = 1, activation= 'relu', max_iter=300, solver = 'adam', verbose=True)
Nn2.fit(X_train,Y_train)


In [None]:
#model_eval(model, train, test, target_train, target_test):
#model = Nn2

#model_eval(model,R_train,R_val,RY_train, RY_val)

In [None]:
#BEST MODEL UNTIL 08/06/2024 23:49 - submisson 05
#Nn1= MLPClassifier(hidden_layer_sizes= (100,50,30,20),learning_rate='adaptive',alpha=0.1, random_state = 1)
#Nn1.fit(X_train,Y_train)


In [None]:
#Nn= MLPClassifier(hidden_layer_sizes= (100,50),learning_rate='adaptive',alpha=0.1)
#Nn.fit(X_train,Y_train)


### Outputs

In [None]:
 #predictions = Nn2.predict(X_test[final_features])

#predictions_df = pd.DataFrame({'citizen_id': X_test.index, 'lifestyle_type': predictions})

 #Seting the destination directory
#destination_directory = ''

 #Saving the DataFrame to a CSV file in the specified directory
#predictions_df.to_csv(os.path.join(destination_directory, 'NT_Group17__Version.csv'), index=False)