In [17]:
import pandas as pd
import numpy as np
import csv
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [18]:
sports_data_orig = pd.read_csv('XYZ_sports_dataset.csv', delimiter =";")
sports_data = sports_data_orig

In [19]:
sports_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14942 entries, 0 to 14941
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          14942 non-null  int64  
 1   Age                         14942 non-null  int64  
 2   Gender                      14942 non-null  object 
 3   Income                      14447 non-null  float64
 4   EnrollmentStart             14942 non-null  object 
 5   EnrollmentFinish            14942 non-null  object 
 6   LastPeriodStart             14942 non-null  object 
 7   LastPeriodFinish            14942 non-null  object 
 8   DateLastVisit               14942 non-null  object 
 9   DaysWithoutFrequency        14942 non-null  int64  
 10  LifetimeValue               14942 non-null  float64
 11  UseByTime                   14942 non-null  int64  
 12  AthleticsActivities         14906 non-null  float64
 13  WaterActivities             149

<h4>Change data types

In [20]:
sports_data['EnrollmentStart'] = pd.to_datetime(sports_data['EnrollmentStart'])
sports_data['EnrollmentFinish'] = pd.to_datetime(sports_data['EnrollmentFinish'])
sports_data['LastPeriodStart'] = pd.to_datetime(sports_data['LastPeriodStart'])
sports_data['LastPeriodFinish'] = pd.to_datetime(sports_data['LastPeriodFinish'])
sports_data['DateLastVisit'] = pd.to_datetime(sports_data['DateLastVisit'])

Binary encode of gender

In [21]:
sports_data.value_counts()

ID     Age  Gender  Income  EnrollmentStart  EnrollmentFinish  LastPeriodStart  LastPeriodFinish  DateLastVisit  DaysWithoutFrequency  LifetimeValue  UseByTime  AthleticsActivities  WaterActivities  FitnessActivities  DanceActivities  TeamActivities  RacketActivities  CombatActivities  NatureActivities  SpecialActivities  OtherActivities  NumberOfFrequencies  AttendedClasses  AllowedWeeklyVisitsBySLA  AllowedNumberOfVisitsBySLA  RealNumberOfVisits  NumberOfRenewals  HasReferences  NumberOfReferences  Dropout
10001  29   Female  2630.0  2014-08-12       2015-09-14        2015-01-01       2015-12-31        2015-07-16     60                    479.20         0          0.0                  0.0              0.0                0.0              0.0             0.0               0.0               0.0               1.0                0.0              23.0                 1                2.0                       17.42                       1                   2                 0.0            

In [22]:
sports_data['Gender'] = sports_data['Gender'].map({'Male': 1, 'Female': 0})

In [23]:
sports_data['Gender'].dtype

dtype('int64')

<h1> Data Exploration </h1>

In [24]:
# replace "" by nans
sports_data.replace("", np.nan, inplace=True)

# count of missing values
sports_data.isna().sum()

ID                              0
Age                             0
Gender                          0
Income                        495
EnrollmentStart                 0
EnrollmentFinish                0
LastPeriodStart                 0
LastPeriodFinish                0
DateLastVisit                   0
DaysWithoutFrequency            0
LifetimeValue                   0
UseByTime                       0
AthleticsActivities            36
WaterActivities                37
FitnessActivities              35
DanceActivities                36
TeamActivities                 35
RacketActivities               37
CombatActivities               33
NatureActivities               47
SpecialActivities              44
OtherActivities                35
NumberOfFrequencies            26
AttendedClasses                 0
AllowedWeeklyVisitsBySLA      535
AllowedNumberOfVisitsBySLA      0
RealNumberOfVisits              0
NumberOfRenewals                0
HasReferences                  12
NumberOfRefere

<h3> Age

In [25]:
#set income as zero when the age is under 16 
sports_data.loc[(sports_data['Age'] < 16), 'Income'] = 0

In [26]:
#only consider fitness activities for customers over 16

drop_fitness_above_16 = sports_data[(sports_data['FitnessActivities'] == 1) & (sports_data['Age'] < 16)].index
sports_data.drop(drop_fitness_above_16, inplace=True)

sports_data[(sports_data['FitnessActivities'] == 1) & (sports_data['Age'] < 16)]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout


In [27]:
pd.set_option('display.max_columns', None)
#Drop every child before 4 years old who is doing other activities that are not water

activities_before_4_years = (sports_data['Age'] < 4) & (
    ((sports_data['TeamActivities'] == 1) | (sports_data['SpecialActivities'] == 1) | (sports_data['CombatActivities'] == 1) |
     (sports_data['RacketActivities'] == 1) | (sports_data['AthleticsActivities'] == 1) | (sports_data['FitnessActivities'] == 1) |
     (sports_data['DanceActivities'] == 1) | (sports_data['NatureActivities'] == 1))
)

sports_data.drop(sports_data[activities_before_4_years].index, inplace=True)

In [28]:
sports_data[sports_data['EnrollmentStart'] == sports_data['EnrollmentFinish']]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout
25,10025,13,0,0.0,2015-09-09,2015-09-09,2019-07-01,2019-12-31,2019-10-26,5,596.30,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,18,2.0,6.28,3,4,0.0,0,0
36,10036,13,0,0.0,2014-10-02,2014-10-02,2019-07-01,2019-12-31,2019-10-22,9,2209.97,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,199.0,166,2.0,17.14,5,5,1.0,1,0
38,10038,38,0,1910.0,2018-09-29,2018-09-29,2019-07-01,2019-12-31,2019-10-30,1,554.60,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.0,125,3.0,26.13,25,1,0.0,0,0
49,10049,56,1,6540.0,2018-09-03,2018-09-03,2019-07-01,2019-12-31,2019-10-29,2,540.10,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,0,7.0,60.97,32,1,0.0,0,0
51,10051,12,0,0.0,2018-08-17,2018-08-17,2019-07-01,2019-12-31,2019-10-28,3,852.30,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,42.0,42,4.0,34.84,4,2,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14918,24918,73,1,3870.0,2014-12-10,2014-12-10,2019-07-01,2019-12-31,2019-10-31,0,1583.90,0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,472.0,0,7.0,60.97,45,5,0.0,0,0
14919,24919,1,0,0.0,2019-07-18,2019-07-18,2019-07-01,2019-12-31,2019-10-31,0,201.20,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7,2.0,17.14,7,1,0.0,0,0
14926,24926,24,0,3290.0,2018-10-08,2018-10-08,2019-01-01,2019-12-31,2019-10-29,2,196.10,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,37,2.0,17.42,7,1,0.0,0,0
14937,24937,14,1,0.0,2016-09-08,2016-09-08,2019-07-01,2019-12-31,2019-10-29,2,1460.45,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,96,4.0,34.84,8,3,0.0,0,0


In [29]:
#all clients with enrollment start = enrollment finish are not dropouts, so let's change the enrollment finish to the retrieval date

sports_data.loc[sports_data['Dropout'] == 0, 'EnrollmentFinish'] = pd.to_datetime('2019-10-31')

sports_data[sports_data['Dropout'] == 0]['EnrollmentFinish'].value_counts()

EnrollmentFinish
2019-10-31    2970
Name: count, dtype: int64

In [30]:
#Cases in which the client is not a dropout, but the days without frequency is bigger than 0

sports_data.loc[(sports_data['DaysWithoutFrequency'] != 0) & (sports_data['Dropout'] == 0), 'DaysWithoutFrequency'] = 0

sports_data[(sports_data['DaysWithoutFrequency'] != 0) & (sports_data['Dropout'] == 0)]


Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout


In [31]:
sports_data['DaysWithoutFrequency'] = sports_data['EnrollmentFinish'] - sports_data['DateLastVisit']

In [32]:
#bin_edges = [0, 15, 25, 35, 45, 64, 65]  
#bin_labels = ['0-15', '15-25', '26-35', '36-45', '45-64', '65+']

# Create a new column 'age_group' with the assigned bins
#sports_data['Age'] = pd.cut(sports_data['Age'], bins=bin_edges, labels=bin_labels, right=False)

#sports_data['Age']

<h2>Feature Engineering

Real number of visits in relation to the allowed number of visits

In [33]:
sports_data['PercentageOfVisits'] = (sports_data['RealNumberOfVisits'] / sports_data['AllowedNumberOfVisitsBySLA']) * 100

Number of Activities the client is signed in

In [34]:
sports_data['NumberOfActivities'] = sports_data.iloc[:, 12:22].sum(axis=1)

In [35]:
sports_data[sports_data['NumberOfActivities'] == 0]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities
333,10333,42,1,4720.0,2016-11-02,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,885.8,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0,63,1.0,8.71,3,3,0.0,0,0,34.443169,0.0
415,10415,53,0,3290.0,2017-10-07,2019-01-14,2018-07-01,2019-06-30,2018-12-14,31 days,264.5,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,5,7.0,42.0,4,1,0.0,0,1,9.52381,0.0
682,10682,22,0,2250.0,2014-10-08,2015-09-16,2015-01-01,2015-06-30,2015-06-04,104 days,144.45,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0,7.0,28.98,11,1,0.0,0,1,37.957212,0.0
1046,11046,14,1,0.0,2015-01-29,2018-07-31,2018-01-01,2018-12-31,2018-05-28,64 days,1420.9,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,90,2.0,17.42,0,4,0.0,0,1,0.0,0.0
1443,11443,42,0,5230.0,2018-03-28,2018-07-31,2018-01-01,2018-12-31,2018-07-30,1 days,93.4,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0,7.0,60.97,18,0,0.0,0,1,29.522716,0.0
2964,12964,15,0,0.0,2017-06-16,2019-10-31,2019-07-01,2019-12-31,2019-03-26,219 days,861.2,0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,13.0,13,2.0,17.42,0,3,0.0,0,0,0.0,0.0
3735,13735,21,0,1200.0,2015-12-29,2016-03-01,2016-01-01,2016-06-30,2016-02-25,5 days,95.6,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,0,7.0,59.99,25,0,0.0,0,1,41.673612,0.0
4085,14085,22,1,2100.0,2015-04-16,2015-09-16,2015-01-01,2015-06-30,2015-04-28,141 days,37.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0,7.0,28.98,2,1,0.0,0,1,6.901311,0.0
4619,14619,24,0,1940.0,2017-11-06,2018-05-08,2018-01-01,2018-06-30,2018-03-19,50 days,123.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0,7.0,60.97,2,0,0.0,0,1,3.280302,0.0
5013,15013,19,1,1090.0,2016-01-06,2016-03-06,2016-01-01,2016-06-30,2016-01-20,46 days,47.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0,7.0,59.99,4,0,0.0,0,1,6.667778,0.0


Se tiver NaN nas atividades, e zero na soma, imputar o NaN como 1

Monthly paid value

In [36]:
sports_data['TotalMonths'] = (sports_data['EnrollmentFinish'] - sports_data['EnrollmentStart']) // np.timedelta64(1, 'M')
sports_data['TotalMonths'] = np.where(sports_data['TotalMonths'] <= 0, 1, sports_data['TotalMonths']) #cases in which is less than one month, we will assume one month

sports_data['MonthlyValue'] = sports_data['LifetimeValue'] / sports_data['TotalMonths']

sports_data.drop('TotalMonths', axis=1) #drop total months column since we only needed it for this code


Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,MonthlyValue
0,10000,60,0,5500.0,2019-09-03,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,89.35,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,7,,6.28,2,0,0.0,0,0,31.847134,1.0,89.350000
1,10001,29,0,2630.0,2014-08-12,2015-09-14,2015-01-01,2015-12-31,2015-07-16,60 days,479.20,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,23.0,1,2.0,17.42,1,2,0.0,0,1,5.740528,1.0,39.933333
2,10002,23,1,1980.0,2017-05-02,2017-06-01,2017-01-01,2017-06-30,2017-05-25,7 days,37.60,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0,7.0,30.03,6,0,0.0,0,1,19.980020,1.0,37.600000
3,10003,9,1,0.0,2018-09-05,2019-02-12,2018-07-01,2019-06-30,2019-01-21,22 days,155.40,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,20.0,2,2.0,17.72,3,0,0.0,0,1,16.930023,1.0,31.080000
4,10004,35,1,4320.0,2016-04-20,2018-06-07,2018-01-01,2018-06-30,2017-11-09,210 days,373.20,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,41.0,0,7.0,60.97,0,3,0.0,0,1,0.000000,1.0,14.928000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14937,24937,14,1,0.0,2016-09-08,2019-10-31,2019-07-01,2019-12-31,2019-10-29,2 days,1460.45,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.0,96,4.0,34.84,8,3,0.0,0,0,22.962113,1.0,39.471622
14938,24938,39,1,,2015-09-17,2016-06-04,2016-01-01,2016-06-30,2016-04-27,38 days,343.85,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,57.0,0,7.0,60.97,3,0,0.0,0,1,4.920453,1.0,42.981250
14939,24939,20,1,1810.0,2017-03-01,2017-03-31,2017-01-01,2017-06-30,2017-03-29,2 days,43.60,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0,7.0,30.03,8,0,0.0,0,1,26.640027,1.0,43.600000
14940,24940,55,1,4800.0,2018-03-01,2019-10-31,2019-07-01,2019-12-31,2019-10-28,3 days,788.60,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,185.0,155,3.0,15.87,7,2,0.0,0,0,44.108381,1.0,41.505263


Percentage of visits that were classes

In [37]:
sports_data['PercentageOfClasses'] = sports_data['AttendedClasses'] / sports_data['NumberOfFrequencies'] * 100

In [38]:
sports_data.loc[(sports_data['HasReferences'] == 1) & (sports_data['NumberOfReferences'] == 0), 'HasReferences'] = 0

crianças também podem ter special e  other activities.

In [39]:
sports_data[sports_data['Age'] < 3]['OtherActivities'].value_counts()

OtherActivities
0.0    379
Name: count, dtype: int64

<h2>Identifying missing values in binary categories

In [42]:
sports_data[sports_data['UseByTime'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses


In [43]:
sports_data[sports_data['AthleticsActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
229,10229,35,1,3740.0,2017-03-24,2018-07-31,2018-01-01,2018-12-31,2018-07-25,6 days,475.0,0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,59.0,19,1.0,8.71,8,2,0.0,0,1,91.84845,1.0,15,31.666667,32.20339
1480,11480,20,0,1790.0,2019-04-18,2019-07-09,2019-01-01,2019-12-31,2019-06-26,13 days,70.8,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0,7.0,60.97,7,0,0.0,0,1,11.481056,1.0,2,35.4,0.0
1650,11650,25,0,2430.0,2015-10-07,2016-02-04,2015-07-01,2016-06-30,2015-12-14,52 days,122.6,0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.0,0,7.0,62.02,2,0,0.0,0,1,3.224766,1.0,3,40.866667,0.0
1961,11961,21,0,1460.0,2019-09-25,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,45.0,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0,,21.98,3,0,0.0,0,0,13.648772,1.0,1,45.0,0.0
2050,12050,25,0,2340.0,2015-03-04,2017-04-13,2017-01-01,2017-06-30,2017-03-28,16 days,236.6,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0,7.0,30.03,4,3,0.0,0,1,13.320013,1.0,24,9.858333,0.0
2062,12062,17,0,1380.0,2016-10-17,2017-04-07,2017-01-01,2017-06-30,2017-02-06,60 days,230.6,0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,8.0,0,7.0,59.01,0,0,0.0,0,1,0.0,1.0,5,46.12,0.0
2192,12192,39,0,4670.0,2016-10-11,2017-08-01,2017-01-01,2017-12-31,2017-03-21,133 days,272.0,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,0,7.0,60.97,0,2,0.0,0,1,0.0,1.0,9,30.222222,0.0
2918,12918,25,0,1920.0,2017-01-03,2018-07-31,2018-01-01,2018-12-31,2018-07-27,4 days,324.0,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,0,7.0,60.97,6,2,0.0,0,1,9.840905,1.0,18,18.0,0.0
3896,13896,13,0,0.0,2016-05-19,2016-07-31,2016-01-01,2016-12-31,2016-07-21,10 days,109.1,0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0,2.0,17.42,3,0,0.0,0,1,17.221584,1.0,2,54.55,0.0
3999,13999,21,0,1470.0,2019-03-09,2019-10-31,2019-01-01,2019-06-30,2019-05-23,161 days,86.5,0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0,7.0,60.97,3,1,0.0,0,0,4.920453,1.0,7,12.357143,0.0


In [44]:
sports_data[sports_data['WaterActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
333,10333,42,1,4720.0,2016-11-02,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,885.8,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,78.0,63,1.0,8.71,3,3,0.0,0,0,34.443169,0.0,35,25.308571,80.769231
415,10415,53,0,3290.0,2017-10-07,2019-01-14,2018-07-01,2019-06-30,2018-12-14,31 days,264.5,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,5,7.0,42.0,4,1,0.0,0,1,9.52381,0.0,14,18.892857,55.555556
642,10642,21,1,1320.0,2017-03-23,2018-06-07,2018-01-01,2018-06-30,2018-04-16,52 days,84.0,0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0,7.0,60.97,2,2,0.0,0,1,3.280302,1.0,14,6.0,0.0
780,10780,24,0,2300.0,2019-02-28,2019-10-31,2019-07-01,2019-12-31,2019-10-29,2 days,214.2,0,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,75.0,0,7.0,35.0,21,1,0.0,0,0,60.0,1.0,7,30.6,0.0
1046,11046,14,1,0.0,2015-01-29,2018-07-31,2018-01-01,2018-12-31,2018-05-28,64 days,1420.9,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,90,2.0,17.42,0,4,0.0,0,1,0.0,0.0,41,34.656098,90.909091
1467,11467,18,0,1310.0,2019-10-15,2019-10-31,2019-07-01,2019-12-31,2019-10-29,2 days,47.2,0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0,,16.03,3,0,0.0,0,0,18.71491,1.0,1,47.2,0.0
1701,11701,26,1,2460.0,2017-02-02,2018-01-22,2017-07-01,2018-06-30,2017-11-30,53 days,248.95,0,0.0,,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,41.0,0,7.0,60.97,7,2,0.0,0,1,11.481056,2.0,11,22.631818,0.0
1919,11919,19,1,1280.0,2016-03-31,2016-06-30,2016-01-01,2016-06-30,2016-06-04,26 days,91.0,0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0,7.0,60.97,8,0,0.0,0,1,13.121207,1.0,2,45.5,0.0
3147,13147,46,0,3760.0,2016-03-02,2018-07-31,2018-01-01,2018-12-31,2018-07-25,6 days,806.2,0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,115.0,0,7.0,60.97,24,3,0.0,0,1,39.363621,1.0,28,28.792857,0.0
3222,13222,25,1,1110.0,2016-02-29,2018-01-07,2017-07-01,2018-06-30,2017-11-30,38 days,297.2,0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,91.0,0,7.0,60.97,12,3,0.0,0,1,19.681811,1.0,21,14.152381,0.0


In [45]:
sports_data[sports_data['FitnessActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
512,10512,60,0,5620.0,2014-12-10,2016-07-08,2016-01-01,2016-12-31,2016-05-02,67 days,545.8,0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0,0,7.0,60.97,0,1,0.0,0,1,0.0,1.0,18,30.322222,0.0
682,10682,22,0,2250.0,2014-10-08,2015-09-16,2015-01-01,2015-06-30,2015-06-04,104 days,144.45,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0,7.0,28.98,11,1,0.0,0,1,37.957212,0.0,11,13.131818,0.0
1058,11058,22,1,2070.0,2018-03-19,2019-10-31,2019-07-01,2019-12-31,2019-10-18,13 days,294.2,0,0.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,60.0,0,7.0,30.03,4,2,0.0,0,0,13.320013,1.0,19,15.484211,0.0
1127,11127,51,1,5270.0,2017-05-08,2018-04-01,2018-01-01,2018-06-30,2018-03-15,17 days,412.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,98.0,3,7.0,59.01,11,2,0.0,0,1,18.640908,1.0,10,41.26,3.061224
1443,11443,42,0,5230.0,2018-03-28,2018-07-31,2018-01-01,2018-12-31,2018-07-30,1 days,93.4,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,0,7.0,60.97,18,0,0.0,0,1,29.522716,0.0,4,23.35,0.0
3307,13307,23,1,2030.0,2014-10-03,2015-09-16,2014-07-01,2014-12-31,2014-11-01,319 days,123.2,0,0.0,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,3,2.0,17.42,2,1,0.0,0,1,11.481056,1.0,11,11.2,100.0
3457,13457,11,1,0.0,2015-10-03,2015-12-05,2015-07-01,2015-12-31,2015-10-24,42 days,68.6,0,0.0,0.0,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,1.0,8.71,1,0,0.0,0,1,11.481056,1.0,2,34.3,100.0
4085,14085,22,1,2100.0,2015-04-16,2015-09-16,2015-01-01,2015-06-30,2015-04-28,141 days,37.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0,7.0,28.98,2,1,0.0,0,1,6.901311,0.0,4,9.4,0.0
4289,14289,57,1,7010.0,2019-01-02,2019-10-31,2019-07-01,2019-12-31,2019-10-25,6 days,250.2,0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,11,1.0,1.86,2,1,0.0,0,0,107.526882,1.0,9,27.8,50.0
4619,14619,24,0,1940.0,2017-11-06,2018-05-08,2018-01-01,2018-06-30,2018-03-19,50 days,123.6,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0,7.0,60.97,2,0,0.0,0,1,3.280302,0.0,5,24.72,0.0


In [46]:
sports_data[sports_data['DanceActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
387,10387,19,1,390.0,2016-04-21,2016-07-31,2016-07-01,2016-12-31,2016-07-16,15 days,63.6,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0,7.0,24.99,3,0,0.0,0,1,12.004802,1.0,3,21.2,0.0
738,10738,44,1,4390.0,2019-06-28,2019-10-31,2019-07-01,2019-12-31,2019-10-07,24 days,113.1,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0,7.0,48.02,2,1,0.0,0,0,4.164931,1.0,4,28.275,0.0
861,10861,21,0,1850.0,2014-10-09,2015-09-16,2015-01-01,2015-06-30,2015-05-27,112 days,151.2,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0,7.0,28.98,1,1,0.0,0,1,3.450656,1.0,11,13.745455,0.0
1499,11499,71,1,2870.0,2019-10-02,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,65.2,0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,1.0,0.0,9.0,0,,28.98,9,0,0.0,0,0,31.055901,1.0,1,65.2,0.0
1965,11965,2,1,0.0,2014-11-15,2015-09-16,2015-01-01,2015-12-31,2015-06-06,102 days,369.7,0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,14.0,11,1.0,8.71,1,1,0.0,0,1,11.481056,1.0,9,41.077778,78.571429
2169,12169,60,1,6670.0,2014-09-01,2015-09-14,2015-01-01,2015-12-31,2014-12-19,269 days,479.2,0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,22.0,20,3.0,26.13,0,1,0.0,0,1,0.0,1.0,12,39.933333,90.909091
2377,12377,22,0,,2015-11-27,2016-07-08,2016-01-01,2016-12-31,2016-05-30,39 days,173.6,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0,7.0,59.01,2,0,0.0,0,1,3.389256,1.0,7,24.8,0.0
2639,12639,24,1,2470.0,2018-01-26,2018-04-07,2018-01-01,2018-06-30,2018-02-16,50 days,56.8,0,0.0,0.0,0.0,,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0,7.0,59.01,1,0,0.0,0,1,1.694628,1.0,2,28.4,0.0
2915,12915,7,1,0.0,2018-09-15,2018-11-11,2018-07-01,2018-12-31,2018-10-30,12 days,69.4,0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4,4.0,23.44,5,0,0.0,0,1,21.331058,1.0,1,69.4,80.0
2959,12959,24,1,1840.0,2017-10-09,2018-06-07,2018-01-01,2018-06-30,2018-04-06,62 days,188.6,0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,27.0,0,7.0,60.97,0,0,0.0,0,1,0.0,1.0,7,26.942857,0.0


In [47]:
sports_data[sports_data['TeamActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
435,10435,20,0,2060.0,2015-03-25,2015-09-16,2015-01-01,2015-06-30,2015-04-27,142 days,37.6,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,4.0,0,7.0,28.98,4,1,0.0,0,1,13.802622,1.0,5,7.52,0.0
1073,11073,7,0,0.0,2016-11-05,2019-07-09,2019-01-01,2019-12-31,2019-04-26,74 days,783.8,0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,52.0,11,2.0,17.42,0,2,0.0,0,1,0.0,1.0,31,25.283871,21.153846
2688,12688,26,1,2630.0,2015-10-20,2016-04-04,2016-01-01,2016-06-30,2015-10-28,159 days,83.0,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,3.0,0,7.0,59.99,0,0,0.0,0,1,0.0,1.0,5,16.6,0.0
2898,12898,36,0,4400.0,2015-08-21,2019-01-14,2018-07-01,2019-06-30,2018-12-26,19 days,1338.2,0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,147.0,147,2.0,17.42,11,5,0.0,0,1,63.145809,1.0,40,33.455,100.0
2944,12944,23,0,2060.0,2016-04-15,2016-06-04,2016-01-01,2016-06-30,2016-04-20,45 days,28.6,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,2.0,0,7.0,45.99,2,0,0.0,0,1,4.348771,1.0,1,28.6,0.0
2964,12964,15,0,0.0,2017-06-16,2019-10-31,2019-07-01,2019-12-31,2019-03-26,219 days,861.2,0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,13.0,13,2.0,17.42,0,3,0.0,0,0,0.0,0.0,27,31.896296,100.0
3204,13204,22,0,1530.0,2015-09-14,2016-06-06,2016-01-01,2016-06-30,2015-11-30,189 days,95.6,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,18.0,0,7.0,60.97,0,0,0.0,0,1,0.0,1.0,8,11.95,0.0
3716,13716,19,0,1200.0,2014-10-15,2015-09-16,2014-07-01,2014-12-31,2014-11-28,292 days,83.2,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,21.0,0,7.0,28.98,4,1,0.0,0,1,13.802622,1.0,10,8.32,0.0
3776,13776,22,0,1770.0,2017-04-04,2019-05-11,2019-01-01,2019-06-30,2019-04-09,32 days,99.5,0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,11.0,1,7.0,60.97,6,3,0.0,0,1,9.840905,1.0,24,4.145833,9.090909
3985,13985,29,1,2900.0,2018-09-21,2019-10-31,2019-07-01,2019-12-31,2019-10-09,22 days,233.6,0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,24.0,19,1.0,3.14,1,1,0.0,0,0,31.847134,1.0,13,17.969231,79.166667


In [48]:
sports_data[sports_data['RacketActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
240,10240,40,0,3070.0,2015-01-07,2015-12-05,2015-07-01,2015-12-31,2015-10-20,46 days,157.2,0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,6.0,0,7.0,45.99,1,1,0.0,0,1,2.174386,1.0,10,15.72,0.0
472,10472,24,1,2380.0,2018-02-07,2018-07-08,2018-01-01,2018-12-31,2018-05-29,40 days,125.4,0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,48.0,0,7.0,60.97,12,0,0.0,0,1,19.681811,1.0,4,31.35,0.0
809,10809,20,0,1480.0,2018-02-26,2018-06-07,2018-01-01,2018-06-30,2018-04-05,63 days,75.4,0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,6.0,6,2.0,17.42,0,0,0.0,0,1,0.0,1.0,3,25.133333,100.0
1014,11014,23,1,2540.0,2014-10-09,2015-09-16,2015-01-01,2015-12-31,2015-03-26,174 days,172.0,0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,69.0,0,7.0,60.97,0,1,0.0,0,1,0.0,1.0,11,15.636364,0.0
3388,13388,15,0,0.0,2014-09-29,2015-11-05,2015-07-01,2015-12-31,2015-10-01,35 days,358.8,0,0.0,1.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,16.0,13,7.0,51.03,3,1,0.0,0,1,5.878895,2.0,12,29.9,81.25
3865,13865,41,1,3260.0,2017-01-24,2017-08-29,2017-01-01,2017-12-31,2017-03-08,174 days,169.85,0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,9.0,0,7.0,90.02,0,0,0.0,0,1,0.0,1.0,7,24.264286,0.0
4131,14131,57,1,4450.0,2014-10-15,2015-09-16,2015-01-01,2015-12-31,2015-06-17,91 days,369.2,0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,10.0,7,1.0,8.71,2,1,0.0,0,1,22.962113,1.0,10,36.92,70.0
4570,14570,52,0,5290.0,2019-10-07,2019-10-31,2019-07-01,2019-12-31,2019-10-31,0 days,51.95,0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,7.0,0,,24.01,6,0,0.0,0,0,24.989588,1.0,1,51.95,0.0
4600,14600,25,0,1900.0,2018-10-02,2019-01-14,2018-07-01,2019-06-30,2018-12-14,31 days,112.4,0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,18.0,0,7.0,60.97,24,0,0.0,0,1,39.363621,1.0,3,37.466667,0.0
5913,15913,61,0,6300.0,2016-08-30,2019-10-31,2019-07-01,2019-12-31,2019-09-28,33 days,1149.8,0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,21.0,20,4.0,28.0,3,4,0.0,0,0,10.714286,1.0,37,31.075676,95.238095


In [49]:
sports_data[sports_data['CombatActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
1152,11152,18,1,1180.0,2018-02-02,2018-07-31,2018-01-01,2018-12-31,2018-05-14,78 days,180.4,1,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,42.0,0,7.0,60.97,2,0,0.0,0,1,3.280302,1.0,5,36.08,0.0
1194,11194,9,1,0.0,2016-09-29,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,1088.3,0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,78.0,56,2.0,17.42,5,3,0.0,0,0,28.702641,1.0,36,30.230556,71.794872
1657,11657,21,0,1790.0,2016-10-01,2018-07-31,2018-01-01,2018-12-31,2018-06-28,33 days,538.7,0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,39.0,0,7.0,66.99,6,1,0.0,0,1,8.956561,1.0,21,25.652381,0.0
2472,12472,55,0,5030.0,2014-08-05,2015-09-14,2014-07-01,2014-12-31,2014-11-06,312 days,118.2,0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,14.0,0,7.0,28.98,6,2,0.0,0,1,20.703934,1.0,13,9.092308,0.0
2484,12484,22,1,1680.0,2015-09-14,2016-01-04,2015-07-01,2016-06-30,2015-11-21,44 days,68.6,0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,9.0,0,7.0,60.97,2,0,0.0,0,1,3.280302,1.0,3,22.866667,0.0
2795,12795,21,1,1140.0,2016-11-29,2019-07-31,2019-01-01,2019-06-30,2019-05-30,62 days,517.9,1,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,123.0,0,7.0,60.97,16,2,0.0,0,1,26.242414,1.0,31,16.706452,0.0
3047,13047,6,1,0.0,2017-09-12,2019-10-31,2019-07-01,2019-12-31,2019-10-29,2 days,774.2,0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,59.0,41,2.0,17.72,1,2,0.0,0,0,5.643341,1.0,25,30.968,69.491525
3087,13087,35,0,3710.0,2018-04-30,2019-10-31,2019-07-01,2019-12-31,2019-10-31,0 days,608.6,0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,103.0,103,2.0,6.28,5,2,0.0,0,0,79.617834,1.0,17,35.8,100.0
3105,13105,19,0,1710.0,2015-10-01,2016-07-08,2016-01-01,2016-12-31,2016-04-28,71 days,203.6,0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,32.0,0,7.0,60.97,0,0,0.0,0,1,0.0,1.0,9,22.622222,0.0
3137,13137,15,1,0.0,2018-10-15,2019-06-11,2019-01-01,2019-06-30,2019-05-29,13 days,303.4,0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,39.0,36,4.0,34.84,14,0,0.0,0,1,40.183697,1.0,7,43.342857,92.307692


In [50]:
sports_data[sports_data['NatureActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
89,10089,20,0,1300.0,2016-04-27,2016-11-02,2016-07-01,2016-12-31,2016-10-20,13 days,82.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,14.0,0,7.0,30.03,6,1,0.0,0,1,19.98002,1.0,6,13.766667,0.0
145,10145,20,1,1630.0,2019-09-25,2019-10-31,2019-07-01,2019-12-31,2019-10-21,10 days,45.1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,7.0,0,,21.98,6,0,0.0,0,0,27.297543,1.0,1,45.1,0.0
337,10337,21,1,2310.0,2015-09-15,2017-07-31,2017-01-01,2017-06-30,2017-03-20,133 days,257.1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,20.0,0,7.0,60.97,2,1,0.0,0,1,3.280302,1.0,22,11.686364,0.0
492,10492,39,1,3130.0,2016-04-22,2018-01-07,2017-07-01,2018-06-30,2017-10-04,95 days,287.7,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,34.0,0,7.0,60.97,0,3,0.0,0,1,0.0,1.0,20,14.385,0.0
1141,11141,1,1,0.0,2019-09-07,2019-10-31,2019-07-01,2019-12-31,2019-10-31,0 days,120.2,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,8.0,0,,15.42,7,0,0.0,0,0,45.39559,1.0,1,120.2,0.0
1417,11417,18,0,1080.0,2019-02-06,2019-10-31,2019-01-01,2019-06-30,2019-02-11,262 days,43.7,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,3.0,0,7.0,34.02,3,1,0.0,0,0,8.818342,1.0,8,5.4625,0.0
2836,12836,32,0,2140.0,2017-01-09,2019-07-09,2019-01-01,2019-12-31,2019-06-18,21 days,749.65,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,144.0,0,7.0,38.01,3,3,0.0,0,1,7.89266,1.0,29,25.85,0.0
2933,12933,44,0,3560.0,2019-01-17,2019-10-31,2019-07-01,2019-12-31,2019-10-04,27 days,345.2,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,35.0,0,7.0,21.98,0,1,0.0,0,0,0.0,1.0,9,38.355556,0.0
3085,13085,2,1,0.0,2018-07-03,2019-10-31,2019-07-01,2019-12-31,2019-10-26,5 days,846.45,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,41.0,37,2.0,17.14,6,2,0.0,0,0,35.005834,1.0,15,56.43,90.243902
3102,13102,24,0,2390.0,2014-09-19,2015-09-16,2014-07-01,2014-12-31,2014-12-12,278 days,113.2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,0.0,0.0,30.0,0,7.0,28.98,5,1,0.0,0,1,17.253278,1.0,11,10.290909,0.0


In [51]:
sports_data[sports_data['SpecialActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
979,10979,35,1,1360.0,2017-05-15,2018-07-31,2018-01-01,2018-12-31,2018-07-31,0 days,195.0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,16.0,0,7.0,59.99,6,2,0.0,0,1,10.001667,1.0,14,13.928571,0.0
1317,11317,23,1,2200.0,2018-04-17,2018-07-08,2018-01-01,2018-12-31,2018-05-28,41 days,40.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,6.0,0,7.0,60.97,3,0,0.0,0,1,4.920453,1.0,2,20.0,0.0
1474,11474,19,0,1040.0,2014-09-19,2016-07-31,2016-01-01,2016-12-31,2016-07-26,5 days,636.8,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,66.0,0,7.0,60.97,4,1,0.0,0,1,6.560604,1.0,21,30.32381,0.0
1546,11546,28,0,3150.0,2015-04-06,2017-09-01,2017-07-01,2017-12-31,2016-07-27,401 days,341.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,101.0,0,7.0,4.97,0,3,0.0,0,1,0.0,1.0,28,12.178571,0.0
1664,11664,21,0,1920.0,2016-09-21,2017-08-29,2017-01-01,2017-12-31,2016-10-04,329 days,177.6,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,3.0,0,1.0,8.71,0,2,0.0,0,1,0.0,2.0,11,16.145455,0.0
1709,11709,30,0,2180.0,2014-06-03,2015-07-31,2016-01-01,2016-06-30,2015-01-07,205 days,220.8,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,2.0,0,7.0,59.99,0,2,0.0,0,1,0.0,2.0,13,16.984615,0.0
2080,12080,18,0,1500.0,2016-11-02,2016-12-02,2016-07-01,2016-12-31,2016-11-29,3 days,53.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,12.0,0,7.0,30.03,12,0,0.0,0,1,39.96004,1.0,1,53.6,0.0
2488,12488,30,1,2090.0,2014-12-26,2019-10-31,2019-07-01,2019-12-31,2019-10-30,1 days,196.4,1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,18.0,0,7.0,49.98,11,5,0.0,0,1,22.008804,2.0,57,3.445614,0.0
2592,12592,19,1,970.0,2015-03-09,2015-09-16,2015-01-01,2015-06-30,2015-05-04,135 days,67.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,0.0,27.0,0,7.0,28.98,11,1,0.0,0,1,37.957212,1.0,6,11.266667,0.0
3587,13587,39,1,5680.0,2018-12-06,2019-03-12,2019-01-01,2019-06-30,2019-02-26,14 days,96.05,0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,,0.0,4.0,0,7.0,32.97,2,0,0.0,0,1,6.066121,2.0,3,32.016667,0.0


In [52]:
sports_data[sports_data['OtherActivities'].isna()]

Unnamed: 0,ID,Age,Gender,Income,EnrollmentStart,EnrollmentFinish,LastPeriodStart,LastPeriodFinish,DateLastVisit,DaysWithoutFrequency,LifetimeValue,UseByTime,AthleticsActivities,WaterActivities,FitnessActivities,DanceActivities,TeamActivities,RacketActivities,CombatActivities,NatureActivities,SpecialActivities,OtherActivities,NumberOfFrequencies,AttendedClasses,AllowedWeeklyVisitsBySLA,AllowedNumberOfVisitsBySLA,RealNumberOfVisits,NumberOfRenewals,HasReferences,NumberOfReferences,Dropout,PercentageOfVisits,NumberOfActivities,TotalMonths,MonthlyValue,PercentageOfClasses
4,10004,35,1,4320.0,2016-04-20,2018-06-07,2018-01-01,2018-06-30,2017-11-09,210 days,373.2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,41.0,0,7.0,60.97,0,3,0.0,0,1,0.0,1.0,25,14.928,0.0
1354,11354,26,0,2540.0,2015-01-29,2015-10-03,2015-01-01,2015-12-31,2015-07-31,64 days,202.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,47.0,0,7.0,60.97,9,1,0.0,0,1,14.761358,1.0,7,28.942857,0.0
1429,11429,24,0,2340.0,2016-05-30,2016-07-31,2016-01-01,2016-12-31,2016-07-14,17 days,73.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,10.0,0,7.0,59.99,12,0,0.0,0,1,20.003334,1.0,2,36.8,0.0
1727,11727,24,1,1920.0,2016-02-01,2016-07-31,2016-01-01,2016-12-31,2016-07-29,2 days,292.6,0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,,49.0,9,2.0,17.42,11,0,0.0,0,1,63.145809,2.0,5,58.52,18.367347
1876,11876,19,0,,2015-05-15,2015-12-05,2015-07-01,2015-12-31,2015-10-27,39 days,107.2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,9.0,0,7.0,60.97,3,1,0.0,0,1,4.920453,1.0,6,17.866667,0.0
2119,12119,24,0,1690.0,2019-02-15,2019-07-09,2019-01-01,2019-12-31,2019-07-01,8 days,145.2,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,68.0,0,7.0,56.98,27,0,0.0,0,1,47.385047,1.0,4,36.3,0.0
2280,12280,18,0,1580.0,2017-10-16,2017-12-08,2017-07-01,2017-12-31,2017-10-16,53 days,30.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0,7.0,52.99,1,0,0.0,0,1,1.887149,1.0,1,30.0,0.0
2457,12457,33,1,3820.0,2015-03-13,2015-09-16,2015-01-01,2015-06-30,2015-04-14,155 days,43.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,8.0,0,7.0,28.98,7,1,0.0,0,1,24.154589,1.0,6,7.266667,0.0
2643,12643,26,0,2220.0,2015-10-27,2016-01-04,2015-07-01,2016-06-30,2015-11-30,35 days,57.6,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,0,7.0,60.97,3,0,0.0,0,1,4.920453,1.0,2,28.8,0.0
3021,13021,31,0,2220.0,2018-05-17,2019-10-31,2019-07-01,2019-12-31,2019-09-30,31 days,392.1,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,160.0,0,7.0,60.97,13,2,0.0,0,0,21.321962,1.0,17,23.064706,0.0


-Missing values, se tiver zero em tudo, fica NaN a 1
-Missing values das categoricas binarias, treinar o modelo que preve