In [None]:
#import all libs

import os
import cx_Oracle
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time
import gc
from IPython.display import set_matplotlib_formats
import getpass
import geopandas as gpd

# Notebook-wide display settings: 'retina' figure format for inline plots
# and seaborn's 'pastel' colour palette.
set_matplotlib_formats('retina')
sns.set_palette('pastel')

In [None]:
#Set up DB connection

# NOTE(review): 'oraclepath', 'ip' and 'servicename' look like placeholders —
# replace them with the real client path, host and service before running.
os.environ['ORACLE_HOME'] = "oraclepath"
# Build the DSN from host, port and service name.
dns_tns = cx_Oracle.makedsn('ip','7777',service_name = 'servicename')
# Credentials are prompted for with getpass instead of being stored in the notebook.
usr = getpass.getpass(prompt='Insert username:\n')
pwd = getpass.getpass(prompt='Insert password:\n')
# `conn` is the connection object the data-loading cells pass to pd.read_sql.
conn = cx_Oracle.connect(user=usr, password=pwd, dsn=dns_tns, encoding='utf-8')

In [None]:
# load target group

# The frame is named `df_target` because every later cell refers to it by that
# name (the previous spelling `df_taret` left `df_target` undefined).
# NOTE(review): `date_columns` is not defined in any visible cell — set it to the
# list of columns to parse as dates before running. The SQL text is an empty
# placeholder and has to be filled in.
df_target = pd.read_sql('''''', con=conn, parse_dates = date_columns)
#df_target = pd.read_csv('')
df_target.shape

In [None]:
# load control group (where look-alike is to be found)

# NOTE(review): `date_columns` is not defined in any visible cell — it must be
# set before this cell runs. The SQL text is an empty placeholder.
df_all = pd.read_sql('''''',con=conn, parse_dates = date_columns)
#df_all = pd.read_csv('')
df_all.shape

 # Preprocessing

In [None]:
# utility methods

def change_to_date(x):
    """Parse ``x`` with ``pd.to_datetime``.

    Returns the parsed value, or ``None`` when parsing raises ``ValueError``.
    """
    parsed = None
    try:
        parsed = pd.to_datetime(x)
    except ValueError:
        # Unparseable input: fall through and return None.
        pass
    return parsed
    
def get_standard_regions(df,column):
    """Normalise free-text region names in ``df[column]`` to canonical names, in place.

    The canonical names are read from the ``region`` field of the shapefile
    ``regions2010_alb_shp/regions2010.shp`` (rows without geometry are ignored).
    For every word of every canonical name, its first five lowercase characters
    become a lookup prefix, except for generic words (republic, oblast, ...).
    Each value of ``df[column]`` is lowercased, and any value containing one of
    the prefixes is replaced by the corresponding canonical name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; ``df[column]`` is overwritten.
    column : str
        Name of the column holding the region text.

    Returns
    -------
    None
    """
    map_df = gpd.read_file('regions2010_alb_shp/regions2010.shp', encoding='cp1251')
    map_df = map_df[map_df['geometry'].notnull()]

    # Word stems that occur in many region names and therefore identify nothing.
    generic_prefixes = ['респу', 'облас', 'округ', 'автон', 'ао', 'город', 'край', '+']

    prefixes = {}
    # dropna(): a missing region name has no .lower() and carries no information.
    for every in map_df['region'].dropna().unique():
        for word in every.lower().split(' '):
            word = word.replace("(", "").replace(")", "")
            key = word[:5]
            # An empty key (from consecutive spaces or a bare parenthesis) would
            # match every row in the containment test below, so it is skipped.
            if key and key not in generic_prefixes:
                prefixes[key] = every
    prefixes['башк'] = 'Республика Башкирия'

    df[column] = df[column].apply(lambda x: x.lower() if pd.notnull(x) else None)

    for prefix, full_name in prefixes.items():
        # .loc is the accessor for boolean-mask assignment (.at is scalar-only);
        # regex=False makes the prefix a literal substring rather than a pattern.
        df.loc[df[column].str.contains(prefix, na=False, regex=False), column] = full_name

In [None]:
# Normalise the REGION column of both groups; get_standard_regions modifies
# the frame it is given and returns nothing.
get_standard_regions(df_target, 'REGION')
get_standard_regions(df_all, 'REGION')

In [None]:
# Binary label: 0 for rows of the control pool, 1 for rows of the target group.
df_all['TARGET']=0
df_target['TARGET']=1

In [None]:
#normalize class balance

# Down-sample the control pool to the size of the target group so both classes
# are equally represented. The fixed seed makes the sample (and everything
# computed from it) reproducible; min() avoids a ValueError from sample() when
# the control pool is smaller than the target group.
n_control = min(df_target.shape[0], df_all.shape[0])
final_df = pd.concat([df_all.sample(n_control, random_state=42), df_target], ignore_index=True)
final_df['TARGET'].value_counts(normalize=True)

In [None]:
# delete columns with almost identical values

for column in final_df.columns:
    shares = final_df[column].value_counts(normalize=True)
    # value_counts ignores NaN, so an all-NaN column yields an empty result;
    # such a column is skipped here instead of failing on iloc[0].
    if not shares.empty and shares.iloc[0] > 0.99:
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        final_df = final_df.drop(columns=column)
        print(str(column)+' deleted')

In [None]:
# delete columns with more than 30% NaN

for column in final_df.columns:
    # Share of non-null values; unlike value_counts().loc[True] this also works
    # for a column that is entirely NaN (share 0.0 instead of a KeyError).
    if final_df[column].notnull().mean() < 0.70:
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        final_df = final_df.drop(columns=column)
        print(str(column)+' deleted')

In [None]:
from category_encoders.target_encoder import TargetEncoder
# Replace the categorical REGION column with its target encoding.
# NOTE(review): the encoder is fit on every row of final_df together with its
# TARGET value, and the same rows are later split into cross-validation folds,
# so the encoded feature may leak label information into validation — confirm
# whether the encoder should be fit inside each fold instead.
te_region = TargetEncoder()
final_df['REGION'] = te_region.fit_transform(final_df['REGION'],final_df['TARGET'])

In [None]:
def calculate_woe_iv(dataset, feature, target):
    """Compute Weight of Evidence (WoE) and Information Value (IV) of a feature.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing both ``feature`` and ``target``.
    feature : str
        Column whose distinct non-null values are evaluated.
    target : str
        Binary column; rows with 0 are counted as "Good", rows with 1 as "Bad".

    Returns
    -------
    (pandas.DataFrame, float)
        A table with one row per distinct value (columns ``Value``, ``All``,
        ``Good``, ``Bad``, ``Distr_Good``, ``Distr_Bad``, ``WoE``, ``IV``)
        sorted by ``WoE``, and the total IV (sum of the ``IV`` column).
        Infinite WoE values (a value seen in only one class) are set to 0.
    """
    base_columns = ['Value', 'All', 'Good', 'Bad']
    # Class masks do not depend on the value being counted, so build them once.
    is_good = dataset[target] == 0
    is_bad = dataset[target] == 1

    lst = []
    # Iterate over the non-null values directly. Indexing unique() with
    # range(nunique()) is wrong when NaN is present: nunique() ignores NaN but
    # unique() keeps it, so a NaN would silently displace a real category.
    for val in dataset[feature].dropna().unique():
        matches = dataset[feature] == val
        lst.append({
            'Value': val,
            'All': int(matches.sum()),
            'Good': int((matches & is_good).sum()),
            'Bad': int((matches & is_bad).sum())
        })

    if not lst:
        # No non-null values: return an empty table with the usual columns.
        return pd.DataFrame(columns=base_columns + ['Distr_Good', 'Distr_Bad', 'WoE', 'IV']), 0.0

    dset = pd.DataFrame(lst, columns=base_columns)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    # log(0) and x/0 are expected for one-class values and handled just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()

    dset = dset.sort_values(by='WoE')

    return dset, iv

In [None]:
# For every object-typed column keep only the category values whose individual
# IV exceeds 0.02, collapse all other values into 'OTHER', and drop the column
# entirely when no value passes the threshold.
for col in final_df.dtypes[final_df.dtypes=='object'].index:
    woe_table, iv = calculate_woe_iv(final_df, col, 'TARGET')
    best_predictors = woe_table[woe_table['IV']>0.02][['Value','IV']]
    if not best_predictors.empty:
        print(col)
        print('Leaving important values: '+ str(best_predictors['Value'].values))
        # Vectorised membership test instead of a per-row lambda.
        keep_mask = final_df[col].isin(best_predictors['Value'].values)
        final_df[col] = final_df[col].where(keep_mask, 'OTHER')
    else:
        print('Dropped useless column:'+col)
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        final_df = final_df.drop(columns=col)

In [None]:
# One-hot encode every remaining object-typed column and remove the original.
# The prefix already ends in '_' and get_dummies adds its own '_' separator,
# so the resulting names have the form '<column>__<value>'.
for column in final_df.dtypes[final_df.dtypes=='object'].index:
    print(column)
    print(final_df[column].nunique())
    final_df = final_df.join(pd.get_dummies(final_df[column],prefix=column+'_'))
    # Keyword form: the positional axis argument was removed in pandas 2.0.
    final_df = final_df.drop(columns=column)

In [None]:
from transliterate import translit, get_available_language_codes
# Transliterate Cyrillic column names to Latin. A column whose name changes is
# re-added under the new name (at the end of the frame) and the old one removed;
# names that are already Latin are left alone.
for every in final_df.columns:
    new_name = str(translit(every,'ru', reversed=True))
    if new_name!=every:
        final_df[new_name]=final_df[every]
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        final_df = final_df.drop(columns=every)

In [None]:
# Shape of the modelling frame after filtering, encoding and renaming.
final_df.shape

# LGBM Feature Selection

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
gc.collect()
# Feature matrix is everything except the label ('TARGE' was a typo that
# raised KeyError; keyword form because positional axis is gone in pandas 2.0).
X = final_df.drop(columns='TARGET')
y = final_df['TARGET']

skf = StratifiedKFold(n_splits=5,shuffle=True, random_state = 42)
scores = []   # ROC AUC of each fold
f_i = np.zeros(X.shape[1])   # feature importances summed over folds

for train_index, test_index in skf.split(X, y):
    gc.collect()
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model = lgb.LGBMClassifier(n_jobs=12,random_state = 42, class_weight='balanced')
    model.fit(X_train,y_train)
    # ROC AUC is a ranking metric: score it on the positive-class probability,
    # not on hard labels, and pass the ground truth first (y_true, y_score).
    y_pred = model.predict_proba(X_test)[:, 1]
    scores.append(roc_auc_score(y_test, y_pred))
    f_i = np.add(f_i,model.feature_importances_)
    gc.collect()

In [None]:
# Features ordered from highest to lowest summed importance.
ranking = (
    pd.DataFrame({'Value':f_i,'Feature':X.columns})
    .sort_values(by='Value',ascending=False)
)

In [None]:
# Per-fold scores collected by the cross-validation loop.
print(scores)

In [None]:
# The ten highest-ranked features.
ranking.iloc[:10]

In [None]:
import sklearn.metrics as metrics
# NOTE(review): `model`, `X_test` and `y_test` are whatever the last iteration of
# the cross-validation loop left behind, so the curve shows the final fold only,
# while the legend reports the mean of all fold scores.
p_fpr, p_tpr, _ = metrics.roc_curve(y_test, model.predict_proba(X_test)[:,1])
roc_val = np.mean(scores)
plt.figure(figsize=(5, 5))
plt.title('Receiver Operating Characteristic ALL')
plt.plot(p_fpr, p_tpr, linestyle=':', color='red', label = 'ROC AUC score = %0.2f' % roc_val)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'b--')
plt.legend(loc = 'lower right')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Cross-validated ROC AUC when only the top-N ranked features are used,
# for each N in n_list. The seed keeps the fold assignment reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_scores = []   # mean CV score per entry of n_list
n_list =[50,30,20,10,5]

for best_n in n_list:

    cv_scores=[]

    best_n_names=ranking['Feature'].iloc[:best_n].values.tolist()
    # Keyword form: the positional axis argument was removed in pandas 2.0.
    X = final_df.drop(columns='TARGET')[best_n_names]
    y = final_df['TARGET']

    for train_index, test_index in skf.split(X, y):
        gc.collect()
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = lgb.LGBMClassifier()
        model.fit(X_train,y_train)
        # Score on positive-class probabilities with the ground truth first:
        # roc_auc_score(y_true, y_score).
        y_pred = model.predict_proba(X_test)[:, 1]
        cv_scores.append(roc_auc_score(y_test, y_pred))
        gc.collect()

    n_scores.append(np.mean(cv_scores))

In [None]:
# Mean cross-validation score against the number of top features used.
plt.figure(figsize=(5,5))
plt.plot(n_list,n_scores)
# NOTE(review): the y-axis is fixed to 0.85–0.95; scores outside that band
# will not be visible — confirm the range suits the data.
plt.ylim(0.85,0.95)

In [1]:
# Keep the 100 highest-ranked features. .copy() makes selected_df an
# independent frame, so filling values does not write into a slice of final_df
# (which triggers SettingWithCopyWarning and may silently not take effect).
selected_df = final_df[ranking.iloc[:100]['Feature'].values].copy()

# Impute every column's missing values with that column's median.
selected_df = selected_df.fillna(selected_df.median())

selected_df.shape

NameError: name 'final_df' is not defined

# Correlations

In [None]:
# Pairwise correlation matrix of the selected features (pandas' default method).
corr_mat = selected_df.corr()

In [None]:
# For each feature, find the group of features whose absolute correlation with
# it exceeds 0.4, keep the member with the highest importance in `ranking`,
# and drop the rest.
# NOTE(review): corr_mat is not recomputed while columns are removed, so groups
# are also formed around features that were already dropped — confirm intended.
for column in corr_mat:
    print(column)
    print('Correlated:')
    corr_cols=corr_mat[abs(corr_mat[column])>0.4].index.tolist()
    print(corr_cols)
    if len(corr_cols)>1:
        best_idx = ranking.loc[ranking['Feature'].isin(corr_cols), 'Value'].idxmax()
        corr_cols.remove(ranking.loc[best_idx, 'Feature'])
        to_drop = [every for every in corr_cols if every != 'TARGET']
        # Keyword form (positional axis was removed in pandas 2.0, and the old
        # blanket `except Exception` hid that failure so nothing was dropped).
        # errors='ignore' covers columns already removed in an earlier iteration.
        selected_df = selected_df.drop(columns=to_drop, errors='ignore')

In [None]:
# Shape after the correlation-based pruning above.
selected_df.shape

In [None]:
# Annotated heatmap of the remaining features' correlation matrix.
# Note: assigning to rcParams changes the default figure size for all later plots.
plt.rcParams['figure.figsize']=(10,10)
sns.heatmap(selected_df.corr(),annot=True)
plt.show()

In [None]:
# Recompute the correlation matrix with Spearman rank correlation.
corr_mat = selected_df.corr(method='spearman')

In [None]:
# Same pruning as for the previous correlation matrix, applied to the current
# `corr_mat`: within each group of features whose absolute correlation exceeds
# 0.4, keep the one with the highest importance in `ranking` and drop the rest.
# NOTE(review): corr_mat is not recomputed while columns are removed, so groups
# are also formed around features that were already dropped — confirm intended.
for column in corr_mat:
    print(column)
    print('Correlated:')
    corr_cols=corr_mat[abs(corr_mat[column])>0.4].index.tolist()
    print(corr_cols)
    if len(corr_cols)>1:
        best_idx = ranking.loc[ranking['Feature'].isin(corr_cols), 'Value'].idxmax()
        corr_cols.remove(ranking.loc[best_idx, 'Feature'])
        to_drop = [every for every in corr_cols if every != 'TARGET']
        # Keyword form (positional axis was removed in pandas 2.0, and the old
        # blanket `except Exception` hid that failure so nothing was dropped).
        # errors='ignore' covers columns already removed in an earlier iteration.
        selected_df = selected_df.drop(columns=to_drop, errors='ignore')

In [None]:
# Annotated heatmap of the Spearman correlations between the remaining features.
plt.rcParams['figure.figsize']=(10,10)
sns.heatmap(selected_df.corr(method='spearman'),annot=True)
plt.show()

In [None]:
# Shape after the Spearman-based pruning.
selected_df.shape

# Re-training on selected features & Scoring

In [None]:
# Final classifier on the pruned feature set. `n_splits` and `shuffle` are
# cross-validation splitter options, not LGBMClassifier parameters, so they are
# no longer passed; the seed matches the one used in the feature-selection CV.
model = lgb.LGBMClassifier(class_weight='balanced', random_state=42)

model.fit(selected_df,final_df['TARGET'])
# Class probabilities for every row; column 1 is the positive class.
y_pred = model.predict_proba(selected_df)

In [None]:
# In-sample (training) ROC AUC. The score uses positive-class probabilities
# with the ground truth as the first argument, and reads the labels from
# final_df rather than from a `y` left over from an earlier cell.
y_pred_train = model.predict(selected_df)
print(roc_auc_score(final_df['TARGET'], model.predict_proba(selected_df)[:, 1]))

In [None]:
# Second column of the predict_proba output, stored alongside the source rows.
final_df['LGBM_SCORE']=y_pred[:,1]

# Prepare for clustering

In [None]:
# Independent copy of the selected features used as clustering input.
final_df_log = selected_df.copy()

In [None]:
# Columns that will go into the clustering.
final_df_log.columns

In [None]:
# Make sure a cluster label from a previous run is not used as a feature.
# Keyword form (positional axis was removed in pandas 2.0); errors='ignore'
# makes this a no-op when the column is absent.
final_df_log = final_df_log.drop(columns='CLUSTERS', errors='ignore')

In [None]:
from sklearn.preprocessing import RobustScaler, PowerTransformer
# Yeo-Johnson power transform is the active choice; RobustScaler is kept as a
# commented-out alternative.
scaler = PowerTransformer(method='yeo-johnson')
#scaler = RobustScaler()

In [None]:
# Fit the transformer on the clustering features and apply it.
final_df_scaled = scaler.fit_transform(final_df_log)

# Dimensionality reduction

In [None]:
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# A float n_components in (0, 1) asks scikit-learn's PCA to keep as many
# components as are needed to explain that fraction of the variance (here 95%).
pca = PCA(0.95)
df_PCA = pca.fit_transform(final_df_scaled)

In [None]:
# First two principal components, target group plotted first, control second.
is_target = (final_df['TARGET']==1).values
is_control = (final_df['TARGET']==0).values

plt.figure(figsize=(7,7))
plt.scatter(df_PCA[is_target,0],df_PCA[is_target,1])
plt.scatter(df_PCA[is_control,0],df_PCA[is_control,1])
plt.show()

In [None]:
# First three principal components in 3D, target group plotted first.
fig =  plt.figure(figsize=(7,7))
ax = fig.add_subplot(111,projection='3d')
for label in (1, 0):
    mask = (final_df['TARGET']==label)
    ax.scatter(df_PCA[mask,0],df_PCA[mask,1],df_PCA[mask,2])
plt.show()

In [None]:
%%time
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
import gc

inertia = []
silhouette = []
for k in range(2,25):
    gc.collect()
    kmeans = MiniBatchKMeans(n_clusters=k,init='k-means++',random_state=22)
    kmeans.partial_fit(df_PCA)
    inertia.append(np.sqrt(kmeans.inertia_))
    silhouette.append(silhouette_score(df_PCA,kmeans.labels_,metric='euclidean',sample_size=100000,random_state=22))
    gc.collect()

In [None]:
# Square root of inertia against the number of clusters k.
plt.figure(figsize=(5,5))
plt.plot(range(2,25),inertia)

In [None]:
# Silhouette score against the number of clusters k.
plt.figure(figsize=(5,5))
plt.plot(range(2,25),silhouette)

In [None]:
from sklearn.cluster import MiniBatchKMeans
# Final clustering of the PCA projection; y_clusters holds one label per row.
# NOTE(review): n_clusters=22 is hard-coded — presumably chosen from the
# inertia/silhouette curves; confirm.
kmeans = MiniBatchKMeans(n_clusters=22,init='k-means++',random_state=22)
y_clusters = kmeans.fit_predict(df_PCA)

In [None]:
final_df['CLUSTERS'] = y_clusters
# Per cluster: mean of TARGET (share of target-group rows in the cluster).
target_balance = final_df.groupby('CLUSTERS')[['TARGET']].mean()
# Per cluster: share of all control rows (TARGET == 0) that fall into it.
cluster_balance = final_df[final_df['TARGET']==0]['CLUSTERS'].value_counts(normalize=True)
# Per cluster: mean LGBM score of the control rows.
mean_score = final_df[final_df['TARGET']==0].groupby('CLUSTERS')[['LGBM_SCORE']].mean()
# The report starts from target_balance (the previous name `bio_balance` was
# never defined and raised NameError).
clustering_report = target_balance.join(cluster_balance).join(mean_score)
clustering_report

In [None]:
# Bar per cluster of the TARGET column (aggregated with seaborn's default
# estimator), with a horizontal reference line at the overall share of
# TARGET == 1 rows.
plt.figure(figsize=(10,7))
sns.barplot(x='CLUSTERS',y='TARGET',data=final_df)

# Overall share of rows with TARGET == 1.
avg_level = final_df['TARGET'].value_counts(normalize=True).loc[1]
# The line spans from the smallest to the largest cluster label.
fst = final_df['CLUSTERS'].min()
sec = final_df['CLUSTERS'].max()
sns.lineplot(x=[fst,sec],y=[avg_level,avg_level])