In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load the training set from a Feather file; the path is relative to the notebook's working directory.
train_data = pd.read_feather("../input/amexfeather/train_data.ftr")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Show the dtype of every column.
train_data.dtypes

### Datatypes present in training data
1) We have 1 object type which is customer id

2) We have 1 datetime feature which is S_2

3) We have 11 categorical features

4) We have 178 features which are either float or integer

##### Note: This is not the original dataset. I am using the dataset available at this [link](http://www.kaggle.com/datasets/munumbutt/amexfeather) because the original is too large to fit in the memory allocated to a Kaggle notebook.

In [None]:
# Summary statistics for every column, transposed so that each feature is one row.
# `datetime_is_numeric=True` asks pandas 1.x to summarise datetime columns with
# numeric-style statistics. pandas 2.0 removed the argument (that behaviour became the
# default), so passing it there raises a TypeError; only pass it on older versions.
describe_kwargs = {"include": "all"}
if int(pd.__version__.split(".")[0]) < 2:
    describe_kwargs["datetime_is_numeric"] = True
train_data.describe(**describe_kwargs).T

In [None]:
# Print the per-column summary. max_cols=200 keeps the full column listing as long as the
# frame has at most 200 columns; show_counts=True forces the non-null counts to be shown.
train_data.info(max_cols=200, show_counts=True)

In [None]:
# Number of missing values in each column.
train_data.isna().sum()

In [None]:
# Apply seaborn's default styling, then draw one bar per target value showing its row count.
sns.set()
sns.countplot(data=train_data, x="target")

#### Looks like we have a skewed dataset: one target class has far more samples than the other.

In [None]:
# Share of each target value as a percentage of all rows.
# The frame is built from explicit columns so it does not depend on the column names that
# `reset_index()` assigns to a `value_counts()` result (those names changed in pandas 2.0,
# which made the previous `rename(...)` a silent no-op and the lookup below a KeyError).
target_share = train_data["target"].value_counts() * 100 / train_data.shape[0]
temp_df = pd.DataFrame({"Target Labels": target_share.index,
                        "Percentage of Distribution": target_share.values})
VALUES = temp_df["Percentage of Distribution"].values

# Derive each slice label from the target value itself instead of from its position in
# the value_counts ordering, so labels cannot be swapped if the class balance changes.
# NOTE(review): assumes target 0 = non-default and 1 = default -- confirm against the data dictionary.
TARGET_NAMES = {0: "Non Default", 1: "Default"}
LABELS = [TARGET_NAMES.get(value, str(value)) for value in temp_df["Target Labels"]]
COLORS = ["#F0F8FF","#00FFFF"]

# Pie chart of the class distribution.
fig = go.Figure(data=[go.Pie(labels=LABELS, values=VALUES,marker=dict(colors=COLORS))])
fig.show()

#### About ***75%*** of the rows are in the majority class (labelled "Non Default" in the pie chart above) and approx ***25%*** are in the minority class ("Default")

In [None]:
# Count missing values once for the whole frame, then keep the names of the columns
# that have at least one missing entry (in original column order).
null_counts_per_column = train_data.isnull().sum()
list_of_features_having_null = null_counts_per_column[null_counts_per_column > 0].index.tolist()

In [None]:
# Number of columns that contain at least one missing value.
len(list_of_features_having_null)

#### There are **121 columns** with missing values. We need to decide whether to impute them or drop them

In [None]:
# Percentage of missing values for each feature
percent_missing = train_data.isnull().sum() * 100 / len(train_data)

# One row per feature; the feature-name index is replaced by a plain 0..n-1 index.
missing_value_df = pd.DataFrame(
    {
        "column_name": train_data.columns,
        "percent_missing": percent_missing,
    }
).reset_index(drop=True)

# Display the features with the largest share of missing values first.
# (To look only at mostly-empty features, filter on percent_missing > 50.)
missing_value_df.sort_values(by="percent_missing", ascending=False)

In [None]:
# Plot only the columns that actually contain missing values, so the chart matches its
# title; columns with 0% missing would otherwise add empty bars to the x-axis.
columns_with_missing = missing_value_df.loc[missing_value_df["percent_missing"] > 0]
px.bar(columns_with_missing, x="column_name", y="percent_missing",
       title="Percentage of Missing Values in the columns having Missing values")

#### Many of the columns have almost ***100% missing*** values. We can remove them from the train and test datasets, as imputing them would not give us any meaningful information

In [None]:
# Names of all columns stored with the pandas "category" dtype, in column order.
categorical_features = train_data.select_dtypes(include="category").columns.tolist()

In [None]:
# Display the names of the categorical columns.
categorical_features

In [None]:
# Keep the S_2 column on its own; despite the plural name this is a single Series.
datetime_features = train_data["S_2"]
# Display the column as the cell output.
datetime_features

In [None]:
# Columns with an integer or float dtype that are neither categorical nor the S_2 date.
# The previous condition ended with `and "S_2"`, a non-empty string that is always truthy,
# so S_2 was never explicitly excluded; compare the feature name instead.
NUMERIC_DTYPES = ["int16","int32","int64","float16", "float32", "float64"]
continous_features = [feature for feature in train_data.columns
                      if train_data[feature].dtypes in NUMERIC_DTYPES
                      and feature not in categorical_features
                      and feature != "S_2"]

In [None]:
# Split the S_2 dates into calendar month and year, one row per original row.
time_df = pd.DataFrame(
    {
        "Month of Spending": datetime_features.dt.month,
        "Year of Spending": datetime_features.dt.year,
    }
)

In [None]:
# Rows per month for 2017. `value_counts()` orders by frequency, so sort by month number
# to lay the bars out in calendar order. A single `.loc[rows, column]` lookup replaces
# chained indexing, and the Series is plotted directly, as in the 2018 chart.
months_2017 = time_df.loc[time_df["Year of Spending"] == 2017, "Month of Spending"]
months_2017.value_counts().sort_index().plot(kind="bar",
                                             figsize=(15,8),
                                             title="Active Number of Customers in the year 2017",
                                             xlabel="Month in year 2017",
                                             ylabel="Active Number of Customers")

In [None]:
# Rows per month for 2018. `value_counts()` orders by frequency, so sort by month number
# to lay the bars out in calendar order. A single `.loc[rows, column]` lookup replaces
# chained indexing.
months_2018 = time_df.loc[time_df["Year of Spending"] == 2018, "Month of Spending"]
months_2018.value_counts().sort_index().plot(kind="bar",
                                             figsize=(15,8),
                                             title="Active Number of Customers in the year 2018",
                                             xlabel="Month in year 2018",
                                             ylabel="Active Number of Customers")

#### From the above 2 figures we see that the maximum number of customers were active in December in 2017, while for 2018 we only have data for the first 3 months. Within 2018, the maximum number of customers were active in March

## Please do provide your feedback