In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# 1- Load the AER credit-card dataset with pandas and preview the first rows

data = pd.read_csv(
    filepath_or_buffer="../input/aer-credit-card-data/AER_credit_card_data.csv"
)
data.head(n=5)

In [None]:
# 2- Count the missing entries in every column

# isna() marks each missing cell; summing down the rows gives one count per column.
data.isna().sum()

In [None]:
# As we can see, there are no missing values in this dataset.

In [None]:
# 3- Describing the dataset: summary statistics produced by DataFrame.describe()

data.describe()

In [None]:
# In this table, we can find a simple statistical report for each attribute.
# For instance, the standard deviation for the "report" column is about 1.345267.

In [None]:
# `data.describe` without parentheses never runs the summary; the cell would only
# echo the bound-method repr. Call the method, and pass include="all" so that
# non-numeric columns are summarised too instead of being left out.
data.describe(include="all")

In [None]:
# 4- Count fully duplicated rows

# A row is flagged when an identical row appeared earlier (the first one is kept),
# so the sum is the number of redundant rows.
data.duplicated(keep="first").sum()

In [None]:
# There are no duplicate rows in this dataset.

In [None]:
# 5- Data Distributions

k = ["age", "income", "share"]
# One median per column, listed in the same order as `k`.
Median = [data[column].median() for column in k]
Median

In [None]:
# The above list shows the median values for "Age", "Income", and "Share" respectively.

In [None]:
# 6- Normalized histograms (Distribution)

age = data["age"]
income = data["income"]
share = data["share"]

# (series, axis label, bar colour) for each panel, left to right.
hist_panels = [(age, "Age", "r"), (income, "Income", "b"), (share, "Share", "g")]

# One row of three axes; each series is drawn on its own explicit axis instead of
# relying on the pyplot "current axes" state.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
for ax, (series, label, color) in zip(axes, hist_panels):
    series.hist(ax=ax, histtype="bar", bins=20, alpha=0.5, color=color, density=True)
    ax.set_xlabel(label)
    # With density=True the bar heights are normalised so the total bar area is 1,
    # i.e. they are densities, not probabilities, so the axis is labelled accordingly.
    ax.set_ylabel("Density")
    ax.set_title(f"{label} Distributions")
plt.show()

In [None]:
# 7- Cumulative Distribution Function

# (series, axis label, line colour) for each panel, left to right.
cdf_panels = [(age, "Age", "r"), (income, "Income", "b"), (share, "Share", "g")]

# One row of three axes; each empirical CDF is drawn on its own explicit axis.
fig, axes = plt.subplots(1, 3, figsize=(15, 3))
for ax, (series, label, color) in zip(axes, cdf_panels):
    # cumulative=True together with density=True makes the step curve end at 1.
    series.hist(
        ax=ax,
        histtype="step",
        cumulative=True,
        linewidth=3,
        bins=20,
        color=color,
        density=True,
    )
    ax.set_xlabel(label)
    ax.set_ylabel("CDF")
    ax.set_title(f"{label} Distributions")
plt.show()

In [None]:
# 8-Outlier

# Flag ages more than 15 below, or more than 30 above, the median age.
lower_b = age < (age.median() - 15)
upper_b = age > (age.median() + 30)

# Remove every flagged row in a single drop; `data` itself is left untouched.
data_n = data.drop(index=data.index[lower_b | upper_b])

# Filtered ages in red first, then the unfiltered ages in blue on the same axes.
data_n.loc[:, "age"].plot(alpha=0.6, color="r")
data.loc[:, "age"].plot(alpha=0.4, color="b")

In [None]:
# In the above cell, I used the median of the age to determine the outliers.
# Depending on the problem, the same method can be applied to other attributes as well.
# The plot indicates that we do not have outliers.
# The blue line shows the age values before deleting outliers, and the red line
# (which looks more reddish) shows the age values after dropping the outlier values.

In [None]:
# 9-Measuring Asymmetry

# Order the outlier-filtered rows by "active" (ascending) and refresh `age`
# so that it is taken from the re-ordered frame.
data = data_n.sort_values("active")
age = data.loc[:, "age"]

def skewness(x):
    """Return the skewness of `x`: sum((x_i - mean)^3) / (n * std^3).

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values. The mean and standard deviation are taken from
        ``x.mean()`` and ``x.std()``, so the object's own defaults apply.

    Returns
    -------
    float
        The skewness, or NaN when it is undefined (no values, or zero spread).

    Notes
    -----
    The cubed deviations are summed first and divided by ``n * std**3`` exactly
    once. Dividing inside the accumulation loop (as an earlier version did)
    rescales the partial sum on every step and does not yield a skewness, so any
    figure recorded from that version should be regenerated.
    """
    m = x.mean()
    s = x.std()
    n = len(x)
    # Skewness is undefined without data or without spread; avoid a 0/0 division.
    if n == 0 or s == 0:
        return float("nan")
    return ((x - m) ** 3).sum() / (n * s ** 3)


# Display the skewness of the `age` series taken from the sorted, outlier-filtered frame.
skewness(age)

In [None]:
# The cell above reports the skewness of age (about 0.0036 in the original run; re-run the cell to refresh this figure).
# Moreover, I sorted the new data frame by the "active" column.

In [None]:
# 10-Calculating the correlation

# Correlation is only defined for numeric columns. The frame still contains the
# text label columns ("card", "owner", "selfemp") at this point, and pandas >= 2.0
# raises on non-numeric columns instead of skipping them, so select the numeric
# columns explicitly (this form works on older pandas versions as well).
data.select_dtypes(include="number").corr()

In [None]:
plt.figure(figsize=(7, 5))
plt.title("Correlation Heatmap")
# Only numeric columns can be correlated; the text label columns still present in
# the frame would make DataFrame.corr() raise on pandas >= 2.0, so drop them here.
heatmap = sns.heatmap(
    data.select_dtypes(include="number").corr(),
    vmin=-1,
    vmax=1,
    annot=True,
    cmap="BrBG",
)

In [None]:
# 10- Label transformation

k = ["card", "owner", "selfemp"]
for column in k:
    le = LabelEncoder()
    # pop() removes the text column from the frame and hands it to the encoder;
    # the integer codes are stored under "<column>_n".
    data[f"{column}_n"] = le.fit_transform(data.pop(column))
data.head()

# The card, owner, and selfemp labels are converted to integers so that they can be used in the regression.
# Yes is equal to 1 and No is equal to 0.

In [None]:
# 11- Implement regression

# Target is the encoded "card_n" column; every other column is used as a feature.
y = data["card_n"]
X = data.drop(columns="card_n")

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(C=1e5).fit(x_train, y_train)
pred = clf.predict(x_test)

# Score of the fitted model on the held-out rows.
clf.score(x_test, y_test)