In [None]:
import os
import time
from collections import Counter
from datetime import datetime
from subprocess import check_output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Directory holding the input CSV files. It defaults to the original location
# and can be overridden with the CHURN_DATA_DIR environment variable so the
# notebook is not tied to one machine. os.path.join(..., "") guarantees the
# trailing separator that the `path_to_data + "file.csv"` concatenations need.
path_to_data = os.path.join(
    os.environ.get("CHURN_DATA_DIR", "/media/raph/Elements/ml1/churn/"), ""
)

# Row limits handed to pd.read_csv as `nrows`; None reads the whole file.
nrows_for_members = None
nrows_for_train = 10000
nrows_for_train_v2 = 10000
nrows_for_transactions = 300000
nrows_for_transactions_v2 = 300000
nrows_for_test = None
nrows_for_test_v2 = None

In [None]:
def _read_and_stack(first_name, first_nrows, second_name, second_nrows):
    """Read two CSV files from `path_to_data` and stack them row-wise.

    The files are read in the order given and concatenated with a fresh
    0..n-1 index (`ignore_index=True`).
    NOTE(review): rows are stacked as-is, so an `msno` present in both files
    appears twice in the result — confirm that is intended.
    """
    frames = [
        pd.read_csv(path_to_data + first_name, nrows=first_nrows),
        pd.read_csv(path_to_data + second_name, nrows=second_nrows),
    ]
    return pd.concat(frames, axis=0, ignore_index=True)


train = _read_and_stack("train_v2.csv", nrows_for_train_v2, "train.csv", nrows_for_train)
members = pd.read_csv(path_to_data + "members_v3.csv", nrows=nrows_for_members)
transactions = _read_and_stack(
    "transactions.csv", nrows_for_transactions,
    "transactions_v2.csv", nrows_for_transactions_v2,
)
test = _read_and_stack(
    "sample_submission_v2.csv", nrows_for_test_v2,
    "sample_submission_zero.csv", nrows_for_test,
)

In [None]:
# Attach the member columns to every row of `train`; rows whose msno is absent
# from `members` are kept and get NaN in those columns (left join).
training = train.merge(members, how='left', on='msno')
del train

# Same left join for the rows of `test`; the inputs are released afterwards.
testing = test.merge(members, how='left', on='msno')
del test, members

In [None]:
# changing type to int and putting -1 for missing values
# NOTE(review): only `training` is encoded in this cell; `testing` keeps the raw
# member columns in the visible cells — confirm it is handled elsewhere.
for column in ('city', 'registered_via', 'bd'):
    # fillna + astype is the vectorised form of "int(x) if not null else -1".
    training[column] = training[column].fillna(-1).astype(np.int64)

# `bd` values outside the open interval (10, 100) are replaced by -1 as well.
training['bd'] = training['bd'].where((training['bd'] > 10) & (training['bd'] < 100), -1)

# encode gender
# The dictionary lookup is kept on purpose: an unexpected label raises KeyError
# instead of being silently turned into the missing-value marker.
genders_encoding = {'male': 0, 'female': 1}
training['gender'] = training.gender.apply(lambda x: genders_encoding[x] if pd.notnull(x) else -1)


def _parse_yyyymmdd(value):
    """Convert a YYYYMMDD number into a `datetime.date`; missing values become pd.NaT.

    pd.NaT is used instead of a "NAN" string so the column never mixes `str`
    and `date` objects, which would make comparisons and sorting raise TypeError.
    NOTE(review): any code that tested these columns against the string "NAN"
    must use pd.isnull() instead.
    """
    if pd.isnull(value):
        return pd.NaT
    return datetime.strptime(str(int(value)), "%Y%m%d").date()


# changing date formats
training['registration_init_time'] = training['registration_init_time'].apply(_parse_yyyymmdd)
transactions['transaction_date'] = transactions['transaction_date'].apply(_parse_yyyymmdd)
transactions['membership_expire_date'] = transactions['membership_expire_date'].apply(_parse_yyyymmdd)
transactions['payment_method_id'] = transactions['payment_method_id'].fillna(-1).astype(np.int64)

In [None]:
# Remove the transactions dated before 2017-02-01.
# DataFrame.drop() expects index/column labels, not a boolean mask, so the rows
# are selected with boolean indexing instead. pd.to_datetime(errors="coerce")
# turns missing or unparseable dates into NaT; NaT compares False against the
# cutoff, so only rows known to be older than the cutoff are removed.
transaction_dates = pd.to_datetime(transactions["transaction_date"], errors="coerce")
# .copy() gives an independent frame so later column assignments do not trigger
# SettingWithCopyWarning.
transactions = transactions[~(transaction_dates < pd.Timestamp("2017-02-01"))].copy()
del transaction_dates

In [None]:
def change_datatype(df):
    """Downcast the integer columns of `df`, in place, to the narrowest signed type.

    Every column returned by ``select_dtypes(include=['int'])`` is converted to
    the first of int8, int16, int32 whose range contains both the column minimum
    and maximum. Columns that fit none of them, or that are empty, are stored
    as int64.

    Args:
        df: pandas DataFrame; its columns are reassigned in place.

    Returns:
        None.
    """
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        # Scan the column once instead of recomputing min/max in every branch,
        # and compare the minimum itself rather than the min of a boolean mask.
        col_min, col_max = df[col].min(), df[col].max()
        for candidate in (np.int8, np.int16, np.int32):
            bounds = np.iinfo(candidate)
            if bounds.min <= col_min and col_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break
        else:
            # Range exceeds int32, or the column is empty (min/max are NaN and
            # every comparison above is False).
            df[col] = df[col].astype(np.int64)

def change_datatype_float(df):
    """Convert the float columns of `df` to float32, modifying `df` in place.

    The affected columns are those returned by
    ``select_dtypes(include=['float'])``. Returns None.
    """
    selected = df.select_dtypes(include=['float'])
    for name in selected.columns.tolist():
        df[name] = df[name].astype(np.float32)
        
def memory_usage(df):
    """Return the memory footprint of `df` as a ``(megabytes, " MB")`` tuple.

    The byte total comes from ``DataFrame.memory_usage(index=True)`` — the index
    is included — and is divided by 1024**2.
    """
    total_bytes = df.memory_usage(index=True).sum()
    megabytes = total_bytes / 1024**2
    return (megabytes, " MB")

In [None]:
# Reduce memory usage: for each frame, downcast the integer columns first and
# then the float columns.
for frame in (training, transactions, testing):
    change_datatype(frame)
    change_datatype_float(frame)

In [None]:
# Index `training` by `msno` so that per-msno features can be joined on the index.
training = training.set_index('msno')
# (msno, count) pairs, ordered from the most to the least frequent msno.
user_count = Counter(transactions['msno'].tolist()).most_common()

In [None]:
# Turn the (msno, count) pairs into a frame indexed by msno. The column names
# are passed to the constructor so an empty `user_count` yields an empty frame
# with the right columns; assigning `.columns` afterwards raises ValueError on
# an empty list because the frame then has no columns to rename.
user_count_2 = pd.DataFrame(
    user_count, columns=['msno', 'number_of_transactions']
).set_index('msno')

# Left join on the index: rows of `training` without a match get NaN.
training = pd.merge(left=training, right=user_count_2, how='left', left_index=True, right_index=True)
training.describe()

In [None]:
# Display the first rows of `training`.
training.head()

In [None]:
# Missing counts (no match in the left join) become 0; present counts are
# converted to int.
training['number_of_transactions'] = training['number_of_transactions'].map(
    lambda count: 0 if pd.isnull(count) else int(count)
)
training.describe()

In [None]:
# Group the transactions by `msno`; sort=False leaves the group keys unsorted.
transactions_2 = transactions.groupby(['msno'], sort=False)

In [None]:
# Mean `payment_plan_days` per msno, as a one-column frame.
usual_payment_plan = (
    transactions_2["payment_plan_days"].mean().to_frame("usual_payment_plan_days")
)
training = training.merge(usual_payment_plan, how='left', left_index=True, right_index=True)
# int() truncates the mean; rows without a match in the join get 0.
training['usual_payment_plan_days'] = training['usual_payment_plan_days'].map(
    lambda days: 0 if pd.isnull(days) else int(days)
)
training.head()

In [None]:
# Display the first rows of `transactions`.
transactions.head()

In [None]:
# Amount paid divided by the plan length. The 0.01 added to the denominator
# keeps it non-zero when `payment_plan_days` is 0.
transactions["price_per_day"] = transactions["actual_amount_paid"].div(
    transactions["payment_plan_days"].add(0.01)
)
transactions.head()

In [None]:
# Regroup by msno so that the groups include the `price_per_day` column.
transactions_2 = transactions.groupby(['msno'], sort=False)
# Mean `price_per_day` per msno, as a one-column frame.
usual_price_per_day = transactions_2["price_per_day"].mean().to_frame("usual_price_per_day")
training = training.merge(usual_price_per_day, how='left', left_index=True, right_index=True)
# Rows without a match in the join get 0; other values are left untouched.
training['usual_price_per_day'] = training['usual_price_per_day'].map(
    lambda price: 0 if pd.isnull(price) else price
)
training.head()

In [None]:
# Most recent transaction row of every msno, indexed by msno.
# The rows are sorted by ascending date and the LAST row per msno is kept.
# `.groupby('msno').first()` on the same ordering would return the earliest
# transaction instead, and it takes the first non-null value of each column
# independently, so it could combine values from different rows.
# Missing dates are sorted first so they are never picked as the latest row;
# mergesort is stable, which makes ties on the date deterministic.
recent_transactions = (
    transactions
    .sort_values('transaction_date', na_position='first', kind='mergesort')
    .drop_duplicates(subset='msno', keep='last')
    .set_index('msno')
    .sort_index()
)

In [None]:
# Display the first rows of `recent_transactions`.
recent_transactions.head()

In [None]:
# Attach the per-msno transaction columns to `training`. Both frames are indexed
# by msno, so the join is done on the indexes like the other feature merges in
# this notebook; `on=['msno']` would rely on pandas resolving index level names
# as merge keys, which older releases do not support.
training = pd.merge(left=training, right=recent_transactions, how='left', left_index=True, right_index=True)
training.head()