## Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime


# from sklearn.linear_model import LogisticRegression #dont need
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree

from src.utils.pandas_extensions import one_hot_encoding

## Load data

In [198]:
unprocessed_data = pd.read_csv("../data/raw/train.csv")

In [199]:
def one_hot_encoding(dataframe: pd.DataFrame, column: str) -> pd.DataFrame:
    if (column not in dataframe.columns):
        raise AssertionError(f"Column {column} doesn't exist")
    
    nparr = dataframe[column].value_counts()
    
    for new_col in nparr.index:
        new_series = dataframe[column] == new_col

        # dataframe = pd.concat([dataframe, new_series], axis=1)
        dataframe[new_col] = new_series
        # print(type(dataframe[column] == new_col))

    return dataframe

In [200]:
np.random.seed(311)
processed_data = unprocessed_data.dropna()
processed_data = one_hot_encoding(processed_data, "Subscription Type")
# processed_data.loc[processed_data["Support Calls"] == "none"] = 0
X = processed_data[["Age", "Gender", "Tenure", "Usage Frequency", "Support Calls", "Customer Status", "Last Interaction", "Payment Delay"]]
y = processed_data["Churn"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[new_col] = new_series
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[new_col] = new_series
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[new_col] = new_series


In [204]:
print(unprocessed_data.head(n=20))
print(processed_data.head())

    CustomerID  Age  Gender  Tenure  Usage Frequency Support Calls  \
0       160831   34    Male     NaN               26          none   
1       354435   30    Male    36.0               22           NaN   
2       444576   23    Male    44.0               14          none   
3       130847   24    Male    39.0               21             2   
4       108121   26    Male     NaN               26             1   
5       301090   19  Female     2.0               17             8   
6       302075   24  Female     NaN               26             6   
7       140302   21    Male     NaN                6             6   
8       431332   45    Male    42.0               26           NaN   
9       135193   27    Male     4.0                4            10   
10      157554   49  Female     NaN               22             3   
11      463693   42    Male     NaN               23             2   
12      468959   30    Male    54.0               13             3   
13       77242   61 

In [106]:
#try ph with depth 1, ph with depth 4, ph with depth None
#try N as the features with the same depth value
#try cycle through the rest of the fa
features = ["Usage Frequency", "Age", "Support Calls", "Last Interaction", "Payment Delay"]
depth_limit = 16
model = DecisionTreeClassifier(criterion='entropy', max_depth=depth_limit)
model.fit(X_train[features], y_train)

#Predict on train data
y_pred_train = model.predict(X_train[features])

#Predict on test data
y_pred_test = model.predict(X_test[features])

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
train_f1_score = metrics.f1_score(y_train, y_pred_train)
testing_accuracy = metrics.accuracy_score(y_test, y_pred_test)
testing_f1_score = metrics.f1_score(y_test, y_pred_test)

print("---------Training performance---------")
print(f"{train_accuracy=}")
print(f"{train_f1_score=}")
print("---------Testing performance---------")
print(f"{testing_accuracy=}")
print(f"{testing_f1_score=}")

---------Training performance---------
train_accuracy=0.9980622984325852
train_f1_score=0.9075949367088607
---------Testing performance---------
testing_accuracy=0.9913291217639044
testing_f1_score=0.6205962059620597


In [None]:
plt.figure(figsize=(3,3))
plot_tree(model, feature_names=features, filled=True)
plt.title(f"Descision Tree ({features=}, {depth_limit=})")

## Export

In [107]:
test_data = pd.read_csv("../data/raw/test.csv")
# test_data.loc[test_data["Support Calls"] == "none"]["Support Calls"] = 0
test_data = test_data.replace("none", 0)
test_pred = model.predict(test_data[features])
test_data.value_counts()

CustomerID  Age  Gender  Tenure  Usage Frequency  Support Calls  Payment Delay  Subscription Type  Contract Length  Total Spend  Last Interaction  Last Due Date  Last Payment Date  Customer Status
25          42   Female  29.0    11               2              27             Premium            Monthly          505.00       19.0              06-22          07-19              active             1
336889      47   Female  21.0    23               5              24             Standard           Monthly          722.00       17.0              06-26          07-20              active             1
336899      24   Female  46.0    16               5              10             Basic              Quarterly        194.89       12.0              06-12          06-22              active             1
336909      37   Male    38.0    21               4              15             Standard           Monthly          718.00       2.0               06-07          06-22              active          

In [111]:
export = test_data[["CustomerID"]]
export["Churn"] = test_pred

date = datetime.datetime.now()
export.to_csv("../data/processed/export" + str(date).replace(":", "") + ".csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  export["Churn"] = test_pred
