# Exploratory data analysis and feature engineering

## 1. Data preparation

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import csv

### Preprocess the training data

In [10]:
# Read the raw competition files; both frames are preprocessed in later cells.
dat_train = pd.read_csv("MLUnige2023_subscriptions_train.csv")
dat_test = pd.read_csv("MLUnige2023_subscriptions_test.csv")

In [11]:
# Columns treated as categorical: they are imputed and one-hot encoded below.
categorical_cols = ['job', 'marital', 'education', 'device', 'outcome_old', 'X1', 'X2', 'X3']

# Handle missing values: give only the categorical columns an explicit
# 'unknown' level. A blanket fillna('unknown') would also write that string
# into any numeric column containing NaN, turning it into an object column
# that the classifier cannot consume.
dat_train = dat_train.fillna({col: 'unknown' for col in categorical_cols})

# Encode categorical variables as one-hot indicator columns
dat_train = pd.get_dummies(dat_train, columns=categorical_cols)

# Split the data into features and target; 'Id' is dropped from the features
# together with the target column.
X_train = dat_train.drop(['Id', 'subscription'], axis=1)
y_train = dat_train['subscription']

In [12]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest (library-default hyperparameters, fixed seed) on the
# training portion; fit() returns the fitted estimator itself.
clf = RandomForestClassifier(random_state=42).fit(X_train_split, y_train_split)

# Score the held-out rows
y_val_pred = clf.predict(X_val_split)

# Report accuracy, then the confusion matrix and the per-class report
print("Accuracy:", accuracy_score(y_val_split, y_val_pred))
for heading, metric_fn in (
    ("Confusion Matrix:", confusion_matrix),
    ("Classification Report:", classification_report),
):
    print(heading)
    print(metric_fn(y_val_split, y_val_pred))

Accuracy: 0.8531546621998883
Confusion Matrix:
[[913 143]
 [120 615]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1056
           1       0.81      0.84      0.82       735

    accuracy                           0.85      1791
   macro avg       0.85      0.85      0.85      1791
weighted avg       0.85      0.85      0.85      1791



### Preprocess the test data

In [13]:
# Preprocess the test data the same way as the training data.
categorical_cols = ['job', 'marital', 'education', 'device', 'outcome_old', 'X1', 'X2', 'X3']

# Fill only the categorical columns with an explicit 'unknown' level. A blanket
# fillna('unknown') would also write that string into any numeric column
# containing NaN, turning it into an object column the classifier cannot consume.
dat_test = dat_test.fillna({col: 'unknown' for col in categorical_cols})
dat_test = pd.get_dummies(dat_test, columns=categorical_cols)

# Align columns of test and training data: reindexing on X_train.columns adds
# any training column missing from the test frame as all zeros, drops columns
# that exist only in the test frame, and puts the columns in training order.
X_test = dat_test.drop('Id', axis=1).reindex(columns=X_train.columns, fill_value=0)

# Make predictions
y_test_pred = clf.predict(X_test)

In [24]:
# Peek at the predicted labels, then display how many predictions were made
print(y_test_pred)
np.shape(y_test_pred)



[0 1 0 ... 0 0 1]


(3837,)

In [43]:
# Build the submission from the test file's own 'Id' column. Deriving the Id
# from row positions (reset_index) only matches the file when its Ids happen
# to be exactly 0..N-1 in order; otherwise predictions get the wrong Id.
test_ids = dat_test['Id'].to_numpy()
assert len(test_ids) == len(y_test_pred), "one prediction is expected per test row"

Predictions_On_Test = pd.DataFrame({'Id': test_ids, 'subscription': y_test_pred})

# index=False keeps the positional index out of the CSV, leaving exactly the
# two columns 'Id' and 'subscription'.
Predictions_On_Test.to_csv("Predictions_On_Test.csv", index=False)

Unnamed: 0,Id,subscription
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1
...,...,...
3832,3832,0
3833,3833,0
3834,3834,0
3835,3835,0
