In [15]:
import pandas as pd
import numpy as np
import csv
from ydata_profiling import ProfileReport
import seaborn as sc

In [16]:
sports_data_orig = pd.read_csv('XYZ_sports_dataset.csv', delimiter =";")
sports_data = sports_data_orig

In [17]:
sports_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14942 entries, 0 to 14941
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          14942 non-null  int64  
 1   Age                         14942 non-null  int64  
 2   Gender                      14942 non-null  object 
 3   Income                      14447 non-null  float64
 4   EnrollmentStart             14942 non-null  object 
 5   EnrollmentFinish            14942 non-null  object 
 6   LastPeriodStart             14942 non-null  object 
 7   LastPeriodFinish            14942 non-null  object 
 8   DateLastVisit               14942 non-null  object 
 9   DaysWithoutFrequency        14942 non-null  int64  
 10  LifetimeValue               14942 non-null  float64
 11  UseByTime                   14942 non-null  int64  
 12  AthleticsActivities         14906 non-null  float64
 13  WaterActivities             149

<h4>Change data types

In [18]:
sports_data['EnrollmentStart'] = pd.to_datetime(sports_data['EnrollmentStart'])
sports_data['EnrollmentFinish'] = pd.to_datetime(sports_data['EnrollmentFinish'])
sports_data['LastPeriodStart'] = pd.to_datetime(sports_data['LastPeriodStart'])
sports_data['LastPeriodFinish'] = pd.to_datetime(sports_data['LastPeriodFinish'])
sports_data['DateLastVisit'] = pd.to_datetime(sports_data['DateLastVisit'])

Binary encode of gender

In [19]:
sports_data.value_counts()

ID     Age  Gender  Income  EnrollmentStart  EnrollmentFinish  LastPeriodStart  LastPeriodFinish  DateLastVisit  DaysWithoutFrequency  LifetimeValue  UseByTime  AthleticsActivities  WaterActivities  FitnessActivities  DanceActivities  TeamActivities  RacketActivities  CombatActivities  NatureActivities  SpecialActivities  OtherActivities  NumberOfFrequencies  AttendedClasses  AllowedWeeklyVisitsBySLA  AllowedNumberOfVisitsBySLA  RealNumberOfVisits  NumberOfRenewals  HasReferences  NumberOfReferences  Dropout
10001  29   Female  2630.0  2014-08-12       2015-09-14        2015-01-01       2015-12-31        2015-07-16     60                    479.20         0          0.0                  0.0              0.0                0.0              0.0             0.0               0.0               0.0               1.0                0.0              23.0                 1                2.0                       17.42                       1                   2                 0.0            

In [20]:
sports_data['Gender'] = sports_data['Gender'].map({'Male': 1, 'Female': 0})

In [21]:
sports_data['Gender'].dtype

dtype('int64')

<h1> Data Exploration </h1>

In [22]:
# replace "" by nans
sports_data.replace("", np.nan, inplace=True)

# count of missing values
sports_data.isna().sum()

ID                              0
Age                             0
Gender                          0
Income                        495
EnrollmentStart                 0
EnrollmentFinish                0
LastPeriodStart                 0
LastPeriodFinish                0
DateLastVisit                   0
DaysWithoutFrequency            0
LifetimeValue                   0
UseByTime                       0
AthleticsActivities            36
WaterActivities                37
FitnessActivities              35
DanceActivities                36
TeamActivities                 35
RacketActivities               37
CombatActivities               33
NatureActivities               47
SpecialActivities              44
OtherActivities                35
NumberOfFrequencies            26
AttendedClasses                 0
AllowedWeeklyVisitsBySLA      535
AllowedNumberOfVisitsBySLA      0
RealNumberOfVisits              0
NumberOfRenewals                0
HasReferences                  12
NumberOfRefere

<h3> Age

In [23]:
#set income as zero when the age is under 16 
sports_data.loc[(sports_data['Age'] < 16), 'Income'] = 0

In [24]:
#only consider fitness activities for customers over 16

drop_fitness_above_16 = sports_data[(sports_data['FitnessActivities'] == 1) & (sports_data['Age'] < 16)].index
sports_data.drop(drop_fitness_above_16, inplace=True)

sports_data[(sports_data['FitnessActivities'] == 1) & (sports_data['Age'] < 16)]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,...,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout


In [25]:
pd.set_option('display.max_columns', None)
#Drop every child before 4 years old who is doing other activities that are not water

activities_before_4_years = (sports_data['Age'] < 4) & (
    ((sports_data['TeamActivities'] == 1) | (sports_data['SpecialActivities'] == 1) | (sports_data['CombatActivities'] == 1) |
     (sports_data['RacketActivities'] == 1) | (sports_data['AthleticsActivities'] == 1) | (sports_data['FitnessActivities'] == 1) |
     (sports_data['DanceActivities'] == 1) | (sports_data['NatureActivities'] == 1))
)

sports_data.drop(sports_data[activities_before_4_years].index, inplace=True)

In [26]:
sports_data[sports_data['EnrollmentStart'] == sports_data['EnrollmentFinish']]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout
25,10025,13,0,0.0,2015-09-09,2015-09-09,2019-07-01,2019-12-31,2019-10-26,5,596.30,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,18,2.0,6.28,3,4,0.0,0,0
36,10036,13,0,0.0,2014-10-02,2014-10-02,2019-07-01,2019-12-31,2019-10-22,9,2209.97,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,199.0,166,2.0,17.14,5,5,1.0,1,0
38,10038,38,0,1910.0,2018-09-29,2018-09-29,2019-07-01,2019-12-31,2019-10-30,1,554.60,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.0,125,3.0,26.13,25,1,0.0,0,0
49,10049,56,1,6540.0,2018-09-03,2018-09-03,2019-07-01,2019-12-31,2019-10-29,2,540.10,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,0,7.0,60.97,32,1,0.0,0,0
51,10051,12,0,0.0,2018-08-17,2018-08-17,2019-07-01,2019-12-31,2019-10-28,3,852.30,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,42.0,42,4.0,34.84,4,2,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14918,24918,73,1,3870.0,2014-12-10,2014-12-10,2019-07-01,2019-12-31,2019-10-31,0,1583.90,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,472.0,0,7.0,60.97,45,5,0.0,0,0
14919,24919,1,0,0.0,2019-07-18,2019-07-18,2019-07-01,2019-12-31,2019-10-31,0,201.20,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7,2.0,17.14,7,1,0.0,0,0
14926,24926,24,0,3290.0,2018-10-08,2018-10-08,2019-01-01,2019-12-31,2019-10-29,2,196.10,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,37,2.0,17.42,7,1,0.0,0,0
14937,24937,14,1,0.0,2016-09-08,2016-09-08,2019-07-01,2019-12-31,2019-10-29,2,1460.45,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,96,4.0,34.84,8,3,0.0,0,0


In [32]:
#when is not a dropout, change to retrieval date
sports_data.loc[sports_data['Dropout'] == 0, 'EnrollmentFinish'] = pd.to_datetime('2019-12-31')

sports_data[sports_data['Dropout'] == 0]['EnrollmentFinish'].value_counts()

EnrollmentFinish
2019-12-31    2970
Name: count, dtype: int64

In [28]:
#Cases in which the client is not a dropout, but the days without frequency is bigger than 0

sports_data.loc[(sports_data['DaysWithoutFrequency'] != 0) & (sports_data['Dropout'] == 0), 'DaysWithoutFrequency'] = 0

sports_data[(sports_data['DaysWithoutFrequency'] != 0) & (sports_data['Dropout'] == 0)]


Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout


In [29]:
sports_data['testedates'] = sports_data['EnrollmentFinish'] - sports_data['DateLastVisit']

sports_data['testedates'] = sports_data['testedates'].astype(str).str.replace('days', '').str.strip()

sports_data['testedates'] = pd.to_numeric(sports_data['testedates'], errors='coerce').astype('Int64')
#wait for corrected version to finish testing
sports_data[sports_data['testedates'] != sports_data['DaysWithoutFrequency']][['testedates', 'DaysWithoutFrequency']]

Unnamed: 0,testedates,DaysWithoutFrequency
0,62,0
19,71,0
25,66,0
34,81,0
36,70,0
...,...,...
14918,61,0
14919,61,0
14926,63,0
14937,63,0


<h2>Feature Engineering

In [30]:
sports_data['RelativePercentageOfVisits'] = (sports_data['RealNumberOfVisits'] / sports_data['AllowedNumberOfVisitsBySLA']) * 100

incorencias
-missing values
-visualizacao
-imputation
-outliers


feature engineering
-data creation
-transformar categoricas em numericas

feature selection
-testar correlacoes e selecionar variaveis para drop 

- imputation no income vazio