In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [2]:
# Directory holding the competition CSVs. It can be overridden without editing
# the notebook by setting the CREDIT_SCORING_DATA_DIR environment variable;
# the default is the original author's local path.
DATA_DIR = os.environ.get('CREDIT_SCORING_DATA_DIR',
                          '/Users/remir/Desktop/DATA-ANALYSYS_2-master/Dataset')

# Both files are indexed by the `client_id` column.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'credit_scoring_train.csv'), index_col='client_id')
test_df = pd.read_csv(os.path.join(DATA_DIR, 'credit_scoring_test.csv'), index_col='client_id')

In [3]:
## Подготовка данных 

In [4]:
# Separate the target column from the training features.
y = train_df['Delinquent90']
train_df = train_df.drop(columns='Delinquent90')

# Fill gaps in the two columns below with that frame's own column median.
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form modifies a temporary column
# object, is deprecated by pandas and has no effect under Copy-on-Write.
# DataFrame.fillna with a Series only touches the columns named in its index.
median_fill_cols = ['NumDependents', 'Income']
train_df = train_df.fillna(train_df[median_fill_cols].median())
test_df = test_df.fillna(test_df[median_fill_cols].median())

In [5]:
## Дерево решений без настройки параметров

In [6]:
# Baseline model: a depth-3 decision tree with a fixed seed, fitted on all
# training rows (fit() returns the estimator itself, so the call can be chained).
first_tree = DecisionTreeClassifier(random_state=17, max_depth=3).fit(train_df, y)

# Hard class labels for the test set.
first_tree_pred = first_tree.predict(test_df)


In [7]:
#Функция записи прогноза в файл
def write_to_submission_file(predicted_labels, out_file,
                             target='Delinquent90', index_label="client_id",
                             index=None):
    """Write predictions to a CSV submission file.

    Parameters
    ----------
    predicted_labels : array-like
        One prediction per row; converted with ``np.asarray`` so plain lists
        are accepted as well as NumPy arrays.
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str
        Name of the single prediction column.
    index_label : str
        Header written for the index column.
    index : array-like, optional
        Row identifiers to write. When omitted, consecutive ids starting at
        75000 are generated, which is the original behaviour.
    """
    predicted_labels = np.asarray(predicted_labels)
    if index is None:
        # Default ids: 75000, 75001, ... one per prediction.
        index = np.arange(75000, predicted_labels.shape[0] + 75000)
    predicted_df = pd.DataFrame(predicted_labels, index=index, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [8]:
# Save the baseline tree's test-set predictions as a submission CSV.
write_to_submission_file(predicted_labels=first_tree_pred,
                         out_file='credit_scoring_first_tree.csv')

In [9]:
## Дерево решений с настройкой параметров с помощью GridSearch

In [10]:
# Hyper-parameter grid for the decision tree: depth 3..7, leaf size 5..12.
tree_params = {'max_depth': list(range(3, 8)),
               'min_samples_leaf': list(range(5, 13))}

# 5-fold grid search over the grid above, using all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score — confirm that matches the competition metric.
locally_best_tree = GridSearchCV(DecisionTreeClassifier(random_state=17), tree_params, cv=5, n_jobs=-1)
locally_best_tree.fit(train_df, y)

# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(locally_best_tree.best_params_, round(locally_best_tree.best_score_, 3))

# The variable and file names promise probabilities, so use predict_proba
# rather than predict (which returns hard class labels). Column 1 is the
# probability of classes_[1] — presumably Delinquent90 == 1; confirm.
tuned_tree_pred_probs = locally_best_tree.predict_proba(test_df)[:, 1]
write_to_submission_file(tuned_tree_pred_probs, 'tuned_tree_pred_probs.csv')

In [11]:
## Случайный лес без настройки параметров

In [12]:
# Random forest with default hyper-parameters and a fixed seed.
first_forest = RandomForestClassifier(random_state=17)
first_forest.fit(train_df, y)

# Predict on the test set: the submission file must hold test predictions,
# not predictions for the rows the model was trained on.
first_forest_pred = first_forest.predict(test_df)
write_to_submission_file(first_forest_pred, 'first_forest_pred.csv')



In [13]:
## Случайный лес с настройкой параметров

In [14]:

%%time
forest_params = {'max_features': np.linspace(.3, 1, 7)}

locally_best_forest = GridSearchCV(DecisionTreeClassifier(random_state=17), tree_params, cv=5, n_jobs=-1)
locally_best_forest.fit(train_df, y)
locally_best_forest.best_params_, round(locally_best_forest.best_score_, 3)
tuned_forest_pred = locally_best_forest.predict(train_df)
write_to_submission_file(tuned_forest_pred, 'tuned_forest_pred.csv')

CPU times: user 1.61 s, sys: 243 ms, total: 1.85 s
Wall time: 16.9 s


In [15]:
# Настроенный случайный лес оценивает важность признаков; увеличим кол-во деревьев для улучшения результата

In [16]:
# One-column table of the feature importances reported by the best estimator
# found by the grid search. Rows are labelled with test_df's column names
# (assumes they are in the same order as the training features — confirm).
pd.DataFrame(
    data=locally_best_forest.best_estimator_.feature_importances_,
    index=test_df.columns,
    columns=['value'],
)

Unnamed: 0,value
DIR,0.005254
Age,0.017305
NumLoans,0.003545
NumRealEstateLoans,0.003867
NumDependents,0.0
Num30-59Delinquencies,0.166844
Num60-89Delinquencies,0.456897
Income,0.008711
BalanceToCreditLimit,0.337577


In [17]:
%%time
final_forest = RandomForestClassifier(n_estimators=300, random_state=17)
final_forest.fit(train_df, y)
final_forest_pred = final_forest.predict_proba(test_df)[:, 1]
write_to_submission_file(final_forest_pred, 'credit_scoring_final_forest.csv')

CPU times: user 50.6 s, sys: 345 ms, total: 50.9 s
Wall time: 51.4 s
