# TMP

# Repository: idealista/mysql_role 

In [None]:
# Optional: silence every warning raised from this point on.
# Remove these two lines if you want warnings to be shown.
import warnings
warnings.simplefilter('ignore')

## Load and prepare data

In [None]:
import pandas as pd

# Load the dataset, keep only the rows of this repository and replace missing values with 0
data = pd.read_csv('../input/ansibledefectsprediction/ansible.csv')
data = data[data.repository == 'idealista/mysql_role'].fillna(0)

# Create column to group files belonging to the same release (identified by the commit hash).
# Every distinct commit hash is relabelled with an integer in 1..number_of_commits.
# NOTE(review): the labels follow the category order of the hashes, not the commit date --
# confirm that `train` only needs them as opaque group identifiers.
data['group'] = data.commit.astype('category').cat.rename_categories(range(1, data.commit.nunique()+1))

# Sort data from the oldest to the newest release.
# sort_values() returns a new DataFrame (it is not in-place by default), so the result
# has to be assigned back; otherwise the rows silently keep their original CSV order.
data = data.sort_values(by=['committed_at'], ascending=True)
data = data.reset_index(drop=True)

# Remove metadata columns but not 'group'
data = data.drop(['commit', 'committed_at', 'filepath', 'repository'], axis=1)

## Train

In [None]:
import joblib
import os
from rq1_base import train

# Features are every remaining column (including 'group'); the target is 'failure_prone'
X, y = data.drop(['failure_prone'], axis=1), data.failure_prone.values.ravel()

# Start from an empty performance file so results from a previous run are not kept
csv_df = pd.DataFrame()
csv_df.to_csv('./performance.csv', index=False)

# One single-row DataFrame per trained method; concatenated after every iteration
performances = []

for method in ['naive_bayes', 'logistic', 'svc', 'decision_tree', 'random_forest']:
    print(f'Training a {method} classifier...')
    # `train` returns a mapping that exposes at least 'cv_results' and 'best_index'
    model = train(X, y, method)

    performance = pd.DataFrame(model['cv_results']).iloc[[model['best_index']]] # Take only the scores at the best index
    performance['method'] = method

    # Dump performance after every method, so partial results survive an interrupted run.
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    performances.append(performance)
    csv_df = pd.concat(performances, ignore_index=True)
    csv_df.to_csv('./performance.csv', index=False)

    # Dump model
    joblib.dump(model, f'./{method}.joblib')

print('Done')
