In [1]:
import utils
import plots
import pandas as pd
import numpy as np

%matplotlib inline

training = utils.load_training_set()



We start by separating the Income, and Number of Dependents into different set of binary classes.



In [2]:
training['UnknownNumberOfDependents'] = (training.NumberOfDependents.isnull() * 1)
training['UnknownMonthlyIncome'] = (training.MonthlyIncome.isnull() * 1)

training['NoDependents'] = training.NumberOfDependents[training.NumberOfDependents == 0]
training.loc[training.NumberOfDependents.isnull(), 'NoDependents'] = 0

training.loc[training.UnknownNumberOfDependents == 1, 'NumberOfDependents'] = 0

training.NoIncome = training.MonthlyIncome
training.loc[training.MonthlyIncome != 0, 'NoIncome'] = 0
training.loc[training.MonthlyIncome == 0, 'NoIncome'] = 1
training.NoIncome.fillna(0)

training.loc[training.UnknownMonthlyIncome == 1, 'MonthlyIncome'] = 0

print('Incomes in No Income %s' % training.NoIncome[training.NoIncome == 1].count())
print('Amount of No Dependents %s' % training.query('NoDependents == 0').NoDependents.count())

Incomes in No Income 605
Amount of No Dependents 90826


We find the ratios of the income and debt and fill them in their column

In [3]:
training['ZeroDebtRatio'] = training.DebtRatio
training.loc[training.DebtRatio == 0, 'ZeroDebtRatio'] = 1
training.loc[training.DebtRatio != 0, 'ZeroDebtRatio'] = 0

print('Zero Debt Ratios %s' % training.ZeroDebtRatio[training.ZeroDebtRatio == 1].count())

training['UnknownIncomeDebtRatio'] = training.DebtRatio
training.loc[training.UnknownMonthlyIncome == 0, 'UnknownIncomeDebtRatio'] = 0
training.loc[training.UnknownMonthlyIncome == 1, 'DebtRatio'] = 0

print('UnknownIncomeDebtRatio %s' \
      % training.UnknownIncomeDebtRatio[training.UnknownIncomeDebtRatio == 0].count())

Zero Debt Ratios 4113
UnknownIncomeDebtRatio 121891


RevolvingUtilizationOfUnsecuredLines is a percentage type for:

> Total balance on credit cards and personal lines of credit except real estate and no installment debt like car loans divided by the sum of credit limits

We query a Log to find weird behaviors in this column and set them to 0, in both a different column, and the old one.

In [4]:
training['WeirdRevolvingUtilization'] = training.RevolvingUtilizationOfUnsecuredLines
training.loc[np.log1p(training.RevolvingUtilizationOfUnsecuredLines) > 3, 'WeirdRevolvingUtilization'] = 0
  
training.loc[training.RevolvingUtilizationOfUnsecuredLines == 0, 'ZeroRevolvingUtilization'] = 0
training.loc[np.log1p(training.RevolvingUtilizationOfUnsecuredLines) > 3, \
             'RevolvingUtilizationOfUnsecuredLines'] = 0

print('Amount of WeirdRevolvingUtilization %s' % 
      training.query('WeirdRevolvingUtilization == 0').WeirdRevolvingUtilization.count())

Amount of WeirdRevolvingUtilization 11112


We generate a Log value for the Debt cross referenced with the Debt Ratio

In [5]:
training['LogDebt'] = np.log1p(np.maximum(training.MonthlyIncome, np.ones(len(training.MonthlyIncome))) \
                             * training.DebtRatio)
training.loc[np.isinf(training.LogDebt), 'LogDebt'] = 0

training.query('LogDebt > 0').LogDebt.head()

0    8.898939
1    5.761668
2    5.560354
3    4.787191
4    7.368955
Name: LogDebt, dtype: float64

We calculate the amount of open credits and real state loans to create e Revolving Lines Column, and three binary columns to check if the user Has any of them.

In [6]:
training['RevolvingLines'] = training.NumberOfOpenCreditLinesAndLoans - training.NumberRealEstateLoansOrLines

training['HasRevolvingLines'] = training.RevolvingLines[training.RevolvingLines > 0].astype(int)
training['HasRealEstateLoans'] = training.NumberRealEstateLoansOrLines[training\
                                                                    .NumberRealEstateLoansOrLines > 0].astype(int)
training['HasMultipleRealEstateLoans'] = training.NumberRealEstateLoansOrLines\
                                            [training.NumberRealEstateLoansOrLines > 2].astype(int)
    
print('Number of Revolving Lines % s' % training.query('HasRevolvingLines == 1').HasRevolvingLines.count())
print('Number of Real Estate Loans %s ' % training.query('HasRealEstateLoans == 1').HasRealEstateLoans.count())
print('Number of Multiple Real Estate Loans % s' %  
      training.query('HasMultipleRealEstateLoans == 1').HasMultipleRealEstateLoans.count())

Number of Revolving Lines 5783
Number of Real Estate Loans 52338 
Number of Multiple Real Estate Loans 0


We separate the DTI (Debt to income ratio) from over 43, and 33, and also the person eligibility to social securty, and the amount of disposable income.

And also cross reference the real estate to the resolving lines

In [7]:
training['EligibleSS'] = training.age[training.age > 60].astype(int)
training['DTIOver33'] = training.NoIncome[(training.NoIncome == 0) & (training.DebtRatio > 0.33)].astype(int)
training['DTIOver43'] = training.NoIncome[(training.NoIncome == 0) & (training.DebtRatio > 0.43)].astype(int)
training['DisposableIncome'] = (1 - training.DebtRatio) * training.MonthlyIncome
training.loc[training.NoIncome == 1, 'DisposableIncome'] = 0

training['RevolvingToRealEstate'] = training.RevolvingLines / (1 + training.NumberRealEstateLoansOrLines)

There are some really worse for the number of times to 3059 days and 6089, the numbers above 90/98/96.

We separate those in differente binary columns

In [8]:
training['NumberOfTime3059DaysPastDueNotWorseLarge'] = training.NumberOfTime3059DaysPastDueNotWorse\
                                            [training.NumberOfTime3059DaysPastDueNotWorse > 90].astype(int)
training['NumberOfTime3059DaysPastDueNotWorse96'] = training.NumberOfTime3059DaysPastDueNotWorse\
                                                [training.NumberOfTime3059DaysPastDueNotWorse == 96].astype(int)
training['NumberOfTime3059DaysPastDueNotWorse98'] = training.NumberOfTime3059DaysPastDueNotWorse\
                                                [training.NumberOfTime3059DaysPastDueNotWorse == 98].astype(int)
training['Never3059DaysPastDueNotWorse'] = training.NumberOfTime3059DaysPastDueNotWorse\
                                                [training.NumberOfTime3059DaysPastDueNotWorse == 0].astype(int)
training.loc[training.NumberOfTime3059DaysPastDueNotWorse > 90, 'NumberOfTime3059DaysPastDueNotWorse'] = 0

In [9]:
training['NumberOfTime6089DaysPastDueNotWorseLarge'] = training.NumberOfTime6089DaysPastDueNotWorse\
                                            [training.NumberOfTime6089DaysPastDueNotWorse > 90].astype(int)

training['NumberOfTime6089DaysPastDueNotWorse96'] = training.NumberOfTime6089DaysPastDueNotWorse\
                                            [training.NumberOfTime6089DaysPastDueNotWorse == 96].astype(int)

training['NumberOfTime6089DaysPastDueNotWorse98'] = training.NumberOfTime6089DaysPastDueNotWorse\
                                            [training.NumberOfTime6089DaysPastDueNotWorse == 98].astype(int)

training['Never6089DaysPastDueNotWorse'] = training.NumberOfTime6089DaysPastDueNotWorse\
                                            [training.NumberOfTime6089DaysPastDueNotWorse == 0].astype(int)
    
training.loc[training.NumberOfTime6089DaysPastDueNotWorse > 90, 'NumberOfTime6089DaysPastDueNotWorse'] = 0

The same for the number of days late

In [10]:
training['NumberOfTimes90DaysLateLarge'] = training.NumberOfTimes90DaysLate\
                                            [training.NumberOfTimes90DaysLate > 90].astype(int)
training['NumberOfTimes90DaysLate96'] = training.NumberOfTimes90DaysLate\
                                            [training.NumberOfTimes90DaysLate == 96].astype(int)
training['NumberOfTimes90DaysLate98'] = training.NumberOfTimes90DaysLate\
                                            [training.NumberOfTimes90DaysLate == 98].astype(int)
training['Never90DaysLate'] = training.NumberOfTimes90DaysLate\
                                            [training.NumberOfTimes90DaysLate == 0].astype(int)
    
training.loc[training.NumberOfTimes90DaysLate > 90, 'NumberOfTimes90DaysLate'] = 0

In [11]:
training['IncomeDivBy10'] = training.MonthlyIncome[training.MonthlyIncome % 10 == 0].astype(int)
training['IncomeDivBy100'] = training.MonthlyIncome[training.MonthlyIncome % 100 == 0].astype(int)
training['IncomeDivBy1000'] = training.MonthlyIncome[training.MonthlyIncome % 1000 == 0].astype(int)
training['IncomeDivBy5000'] = training.MonthlyIncome[training.MonthlyIncome % 5000 == 0].astype(int)

In [12]:
training['Weird0999Utilization'] = training.RevolvingUtilizationOfUnsecuredLines\
                                    [training.RevolvingUtilizationOfUnsecuredLines == 0.9999999].astype(int)

training['FullUtilization'] = training.RevolvingUtilizationOfUnsecuredLines\
                                    [training.RevolvingUtilizationOfUnsecuredLines == 1].astype(int)

training['ExcessUtilization'] = training.RevolvingUtilizationOfUnsecuredLines\
                                    [training.RevolvingUtilizationOfUnsecuredLines > 1].astype(int)

In [13]:
training['NumberOfTime3089DaysPastDueNotWorse'] = (training.NumberOfTime3059DaysPastDueNotWorse +
                                                   training.NumberOfTime6089DaysPastDueNotWorse)
training['Never3089DaysPastDueNotWorse'] = (training.Never6089DaysPastDueNotWorse *
                                            training.Never3059DaysPastDueNotWorse)

In [14]:
training['NumberOfTimesPastDue'] = (training.NumberOfTime3059DaysPastDueNotWorse + 
                                    training.NumberOfTime6089DaysPastDueNotWorse +
                                    training.NumberOfTimes90DaysLate)
training['NeverPastDue'] = (training.Never90DaysLate * 
                            training.Never6089DaysPastDueNotWorse * 
                            training.Never3059DaysPastDueNotWorse)

training['LogRevolvingUtilizationTimesLines'] = np.log1p(training.RevolvingLines * 
                                                       training.RevolvingUtilizationOfUnsecuredLines)

In [15]:
training['LogRevolvingUtilizationOfUnsecuredLines'] = np.log(training.RevolvingUtilizationOfUnsecuredLines)
training.LogRevolvingUtilizationOfUnsecuredLines = training\
                                            .LogRevolvingUtilizationOfUnsecuredLines\
                                            .replace([np.inf], np.NAN)

training.LogRevolvingUtilizationOfUnsecuredLines.fillna(0)

training = training.drop(['RevolvingUtilizationOfUnsecuredLines'], axis=1)

  if __name__ == '__main__':


In [16]:
training['DelinquenciesPerLine'] = training.NumberOfTimesPastDue / training.NumberOfOpenCreditLinesAndLoans
training.loc[(training.NumberOfOpenCreditLinesAndLoans == 0), 'DelinquenciesPerLine'] = 0

training['MajorDelinquenciesPerLine'] = training.NumberOfTimes90DaysLate / training.NumberOfOpenCreditLinesAndLoans
training.loc[(training.NumberOfOpenCreditLinesAndLoans == 0), 'MajorDelinquenciesPerLine'] = 0

training['MinorDelinquenciesPerLine'] = (training.NumberOfTime3089DaysPastDueNotWorse / 
                                        training.NumberOfOpenCreditLinesAndLoans)
training.loc[(training.NumberOfOpenCreditLinesAndLoans == 0), 'MinorDelinquenciesPerLine'] = 0

In [17]:
training['DelinquenciesPerRevolvingLine'] = training.NumberOfTimesPastDue / training.RevolvingLines
training.loc[training.DelinquenciesPerRevolvingLine == 0, 'RevolvingLines'] = 0

training['MajorDelinquenciesPerRevolvingLine'] = training.NumberOfTimes90DaysLate / training.RevolvingLines
training.loc[training.RevolvingLines == 0, 'MajorDelinquenciesPerRevolvingLine'] = 0

training['MinorDelinquenciesPerRevolvingLine'] = (training.NumberOfTime3089DaysPastDueNotWorse / 
                                                  training.RevolvingLines)
training.loc[training.RevolvingLines == 0, 'MinorDelinquenciesPerRevolvingLine'] = 0

In [18]:
training['LogDebtPerLine'] = training.LogDebt - np.log1p(training.NumberRealEstateLoansOrLines)
training['LogDebtPerRealEstateLine'] = training.LogDebt - np.log1p(training.NumberRealEstateLoansOrLines)
training['LogDebtPerPerson'] = training.LogDebt - np.log1p(training.NumberOfDependents)

training['RevolvingLinesPerPerson'] = training.RevolvingLines / (1+ training.NumberOfDependents)

training['RealEstateLoansPerPerson'] = training.NumberRealEstateLoansOrLines / (1 + training.NumberOfDependents)

training['UnknownNumberOfDependents'] = training.UnknownNumberOfDependents.astype(int)

training['YearsOfAgePerDependent'] = training.age / (1 + training.NumberOfDependents)

In [19]:
training['LogMonthlyIncome'] = np.log(training.MonthlyIncome)

training.LogMonthlyIncome = training.LogMonthlyIncome.replace([np.inf], np.NAN)
training.LogMonthlyIncome.fillna(0)

training = training.drop(['MonthlyIncome'], axis=1)

training['LogIncomePerPerson'] = training.LogMonthlyIncome - np.log1p(training.NumberOfDependents)
training['LogIncomeAge'] = training['LogMonthlyIncome'] - np.log1p(training.age)

  if __name__ == '__main__':


In [20]:
training['LogNumberOfTimesPastDue'] = np.log(training.NumberOfTimesPastDue)
training.LogNumberOfTimesPastDue = training.LogNumberOfTimesPastDue.replace([np.inf, -np.inf], 0)

  if __name__ == '__main__':


In [21]:
training['LogNumberOfTimes90DaysLate'] = np.log(training.NumberOfTimesPastDue)
training.LogNumberOfTimes90DaysLate = training.LogNumberOfTimes90DaysLate.replace([np.inf, -np.inf], 0)

  if __name__ == '__main__':


In [22]:
training['LogNumberOfTime3059DaysPastDueNotWorse'] = np.log(training.NumberOfTime3059DaysPastDueNotWorse)
training.NumberOfTime3059DaysPastDueNotWorse = training.NumberOfTime3059DaysPastDueNotWorse\
                                                .replace([np.inf, -np.inf], 0)

  if __name__ == '__main__':


In [23]:
training['LogNumberOfTime6089DaysPastDueNotWorse'] = np.log(training.NumberOfTime6089DaysPastDueNotWorse)
training.LogNumberOfTime6089DaysPastDueNotWorse = training.LogNumberOfTime6089DaysPastDueNotWorse\
                                                .replace([np.inf, -np.inf], 0)

  if __name__ == '__main__':


In [24]:
training['LogRatio90to3059DaysLate'] = (training.LogNumberOfTimes90DaysLate - 
                                        training.NumberOfTime3059DaysPastDueNotWorse)

training['LogRatio90to6089DaysLate'] = (training.LogNumberOfTimes90DaysLate - 
                                        training.LogNumberOfTime6089DaysPastDueNotWorse)

In [25]:
training['AnyOpenCreditLinesOrLoans'] = training.NumberOfOpenCreditLinesAndLoans\
                                        [training.NumberOfOpenCreditLinesAndLoans > 0].astype(int)
    
training['LogNumberOfOpenCreditLinesAndLoans'] = np.log(training.NumberOfOpenCreditLinesAndLoans)

training.NumberOfOpenCreditLinesAndLoans = training.NumberOfOpenCreditLinesAndLoans\
                                                .replace([np.inf], 0)

training['LogNumberOfOpenCreditLinesAndLoansPerPerson'] = (training.LogNumberOfOpenCreditLinesAndLoans - 
                                                          np.log1p(training.NumberOfDependents))

  app.launch_new_instance()


In [26]:
training['HasDependents'] = training.NumberOfDependents[training.NumberOfDependents > 0].astype(int)

training['LogHouseholdSize'] = np.log1p(training.NumberOfDependents)

training = training.drop(['NumberOfDependents'], axis=1)

In [27]:
training['LogDebtRatio'] = np.log(training.DebtRatio)
training.LogDebtRatio = training.LogDebtRatio.replace([np.inf], 0)
training = training.drop(['DebtRatio'], axis=1)

  if __name__ == '__main__':


In [28]:
training['LogDebtPerDelinquency'] = training.LogDebt - np.log1p(training.NumberOfTimesPastDue)
training['LogDebtPer90DaysLate'] = training.LogDebt - np.log1p(training.NumberOfTimes90DaysLate)

In [29]:
training['LogUnknownIncomeDebtRatio'] = np.log(training.UnknownIncomeDebtRatio)
training.LogUnknownIncomeDebtRatio = training.LogUnknownIncomeDebtRatio.replace([np.inf], 0)

training['LogUnknownIncomeDebtRatioPerPerson'] = training.LogUnknownIncomeDebtRatio - training.LogHouseholdSize
training['LogUnknownIncomeDebtRatioPerLine'] = (training.LogUnknownIncomeDebtRatio - 
                                            np.log1p(training.NumberOfOpenCreditLinesAndLoans))

training['LogUnknownIncomeDebtRatioPerRealEstateLine'] = (training['LogUnknownIncomeDebtRatio'] - 
                                                        np.log1p(training.NumberOfTimesPastDue))

training['LogUnknownIncomeDebtRatioPerDelinquency'] = (training.LogUnknownIncomeDebtRatio - 
                                                    np.log1p(training.NumberOfTimesPastDue))

training['LogUnknownIncomeDebtRatioPer90DaysLate'] = (training.LogUnknownIncomeDebtRatio - 
                                                      np.log1p(training.NumberOfTimes90DaysLate))

  if __name__ == '__main__':


In [30]:
training['LogNumberRealEstateLoansOrLines'] = np.log(training.NumberRealEstateLoansOrLines)
training.LogNumberRealEstateLoansOrLines = training.LogNumberRealEstateLoansOrLines.replace([np.inf], 0)

training = training.drop(['NumberRealEstateLoansOrLines'], axis=1)

  if __name__ == '__main__':


In [31]:
training = training.drop(['NumberOfOpenCreditLinesAndLoans',
                         'NumberOfTimesPastDue',
                         'NumberOfTimes90DaysLate',
                         'NumberOfTime3059DaysPastDueNotWorse',
                         'NumberOfTime6089DaysPastDueNotWorse'], axis=1)

In [32]:
training['LowAge'] = training.age[training.age < 18].astype(int)
training['Logage'] = np.log(training.age - 17)
training.loc[training.LowAge == 1, 'Logage'] = 0

training = training.drop(['age'], axis=1)

training.head(10)



Unnamed: 0,SeriousDlqin2yrs,UnknownNumberOfDependents,UnknownMonthlyIncome,NoDependents,NoIncome,ZeroDebtRatio,UnknownIncomeDebtRatio,WeirdRevolvingUtilization,ZeroRevolvingUtilization,LogDebt,...,LogDebtPer90DaysLate,LogUnknownIncomeDebtRatio,LogUnknownIncomeDebtRatioPerPerson,LogUnknownIncomeDebtRatioPerLine,LogUnknownIncomeDebtRatioPerRealEstateLine,LogUnknownIncomeDebtRatioPerDelinquency,LogUnknownIncomeDebtRatioPer90DaysLate,LogNumberRealEstateLoansOrLines,LowAge,Logage
0,1,0,0,,0.0,0.0,0.0,0.766127,,8.898939,...,8.898939,-inf,-inf,-inf,-inf,-inf,-inf,1.791759,,3.332205
1,0,0,0,,0.0,0.0,0.0,0.957151,,5.761668,...,5.761668,-inf,-inf,-inf,-inf,-inf,-inf,-inf,,3.135494
2,0,0,0,0.0,0.0,0.0,0.0,0.65818,,5.560354,...,4.867207,-inf,-inf,-inf,-inf,-inf,-inf,-inf,,3.044522
3,0,0,0,0.0,0.0,0.0,0.0,0.23381,,4.787191,...,4.787191,-inf,-inf,-inf,-inf,-inf,-inf,-inf,,2.564949
4,0,0,0,0.0,0.0,0.0,0.0,0.907239,,7.368955,...,7.368955,-inf,-inf,-inf,-inf,-inf,-inf,0.0,,3.465736
5,0,0,0,,0.0,0.0,0.0,0.213179,,7.182067,...,7.182067,-inf,-inf,-inf,-inf,-inf,-inf,0.0,,4.043051
6,0,0,1,0.0,0.0,0.0,5710.0,0.305682,,0.0,...,0.0,8.649974,8.649974,6.45275,8.649974,8.649974,8.649974,1.098612,,3.688879
7,0,0,0,0.0,0.0,0.0,0.0,0.754464,,6.600945,...,6.600945,-inf,-inf,-inf,-inf,-inf,-inf,-inf,,3.091042
8,0,1,1,0.0,0.0,0.0,46.0,0.116951,,0.0,...,0.0,3.828641,3.828641,2.730029,3.828641,3.828641,3.828641,-inf,,2.302585
9,0,0,0,,0.0,0.0,0.0,0.189169,,9.572229,...,9.572229,-inf,-inf,-inf,-inf,-inf,-inf,1.386294,,3.688879


In [33]:
columns = training.columns.tolist()

columns

['SeriousDlqin2yrs',
 'UnknownNumberOfDependents',
 'UnknownMonthlyIncome',
 'NoDependents',
 'NoIncome',
 'ZeroDebtRatio',
 'UnknownIncomeDebtRatio',
 'WeirdRevolvingUtilization',
 'ZeroRevolvingUtilization',
 'LogDebt',
 'RevolvingLines',
 'HasRevolvingLines',
 'HasRealEstateLoans',
 'HasMultipleRealEstateLoans',
 'EligibleSS',
 'DTIOver33',
 'DTIOver43',
 'DisposableIncome',
 'RevolvingToRealEstate',
 'NumberOfTime3059DaysPastDueNotWorseLarge',
 'NumberOfTime3059DaysPastDueNotWorse96',
 'NumberOfTime3059DaysPastDueNotWorse98',
 'Never3059DaysPastDueNotWorse',
 'NumberOfTime6089DaysPastDueNotWorseLarge',
 'NumberOfTime6089DaysPastDueNotWorse96',
 'NumberOfTime6089DaysPastDueNotWorse98',
 'Never6089DaysPastDueNotWorse',
 'NumberOfTimes90DaysLateLarge',
 'NumberOfTimes90DaysLate96',
 'NumberOfTimes90DaysLate98',
 'Never90DaysLate',
 'IncomeDivBy10',
 'IncomeDivBy100',
 'IncomeDivBy1000',
 'IncomeDivBy5000',
 'Weird0999Utilization',
 'FullUtilization',
 'ExcessUtilization',
 'NumberOfTi

In [34]:
# load the created from the R script
import utils

columns_r = utils.load_features_set().columns.tolist()

columns_r

['SeriousDlqin2yrs',
 'UnknownNumberOfDependents',
 'UnknownMonthlyIncome',
 'NoDependents',
 'NoIncome',
 'ZeroDebtRatio',
 'UnknownIncomeDebtRatio',
 'WeirdRevolvingUtilization',
 'ZeroRevolvingUtilization',
 'LogDebt',
 'RevolvingLines',
 'HasRevolvingLines',
 'HasRealEstateLoans',
 'HasMultipleRealEstateLoans',
 'EligibleSS',
 'DTIOver33',
 'DTIOver43',
 'DisposableIncome',
 'RevolvingToRealEstate',
 'NumberOfTime3059DaysPastDueNotWorseLarge',
 'NumberOfTime3059DaysPastDueNotWorse96',
 'NumberOfTime3059DaysPastDueNotWorse98',
 'Never3059DaysPastDueNotWorse',
 'NumberOfTime6089DaysPastDueNotWorseLarge',
 'NumberOfTime6089DaysPastDueNotWorse96',
 'NumberOfTime6089DaysPastDueNotWorse98',
 'Never6089DaysPastDueNotWorse',
 'NumberOfTimes90DaysLateLarge',
 'NumberOfTimes90DaysLate96',
 'NumberOfTimes90DaysLate98',
 'Never90DaysLate',
 'IncomeDivBy10',
 'IncomeDivBy100',
 'IncomeDivBy1000',
 'IncomeDivBy5000',
 'Weird0999Utilization',
 'FullUtilization',
 'ExcessUtilization',
 'NumberOfTi

In [35]:
list(set(columns_r) - set(columns))

[]

In [None]:
training.to_csv(utils.os.path.join(utils.DIR, 'dataset', 'cs-training-features.csv'),
                      index=False)