In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings

# Silence every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Load the training table from the feather file.
df = pd.read_feather(path='../input/amexfeather/train_data.ftr')

# List every column with its non-null count and dtype.
df.info(verbose=True, show_counts=True)


The dataset has 190 features, of which the following are categorical:
['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

The rest appear to be numerical.
We also observe a lot of missing data for certain features. We will need to identify those features and select features first on the basis of data availability and then on the basis of statistical importance, as we may discover a little later in this notebook.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
# Draw the per-class row counts on an explicit axes object of a 5x5-inch figure.
ax = plt.figure(figsize=(5, 5)).gca()

sn.countplot(data=df, x="target", ax=ax);

Plotting the class distribution, we keep in mind that the negative class here is 0 and the positive class is 1. We may need to use some technique for balanced learning if the results from our base model are not satisfactory. Meanwhile, we prepare our dataset for machine learning.

In [None]:
# Names of the features treated as categorical; every other kept column is
# treated as numerical.
cat_f = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126','D_63','D_64', 'D_66', 'D_68']

# Candidate columns: everything in the frame except the three removed below.
# list.remove raises ValueError when a name is absent, so a schema change
# fails loudly instead of being silently ignored.
# NOTE(review): the reason for excluding "D_142" is not visible here - confirm.
all_f = list(df.columns)
all_f.remove("customer_ID")
all_f.remove("S_2")
all_f.remove("D_142")

# Numerical features = kept columns that are not categorical.
# An order-preserving comprehension is used instead of a set difference:
# set iteration order for strings changes with hash randomisation, which made
# this list (and the preview printed below) differ from one kernel start to
# the next.
# "target" is still part of all_f at this point, so it ends up in num_f.
num_f = [col for col in all_f if col not in cat_f]

print(num_f[0:5])

In [None]:
# Restrict the frame to the columns selected for modelling.
df = df.loc[:, all_f]

# Per-column count of missing values.
print(df.isna().sum())

# Keep only the columns whose non-null count reaches min_count, i.e. strictly
# more than (100 - perc)% of the rows are populated; columns with perc% or
# more missing values are dropped.
perc = 20.0  # tolerated share of missing values, in percent
min_count = int(((100 - perc) / 100) * len(df) + 1)
df = df.dropna(axis="columns", thresh=min_count)
df

After removing certain features from the dataframe, we clean it further by dropping all rows that contain NA values. We have lost some data but still have plenty left for model building and testing.

In [None]:
# Discard every row that still contains a missing value, then renumber the
# rows from 0 without keeping the old index as a column.
df = df.dropna().reset_index(drop=True)
df

In [None]:
# Re-derive the feature lists from the columns that survived the cleaning steps.
all_f = list(df.columns)

# Numerical features: surviving columns that are not in the categorical list.
# Order-preserving comprehensions replace the previous set arithmetic, whose
# result order changed with hash randomisation between kernel starts.
# "target" is still a column of df here, so it is included in num_f.
num_f = [col for col in all_f if col not in cat_f]

# Categorical features: the declared categorical names that are still present,
# in dataframe column order. This list later fixes the order of the dummy
# columns, so keeping it deterministic keeps the encoded frame reproducible.
cat_f = [col for col in all_f if col in cat_f]

num_f[0:5]

In [None]:
# Model inputs: every remaining column except the label.
all_f = df.columns.tolist()
all_f.remove("target")

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each categorical so the dummy columns are not perfectly collinear.
encoded_df = pd.get_dummies(data=df[all_f], columns=cat_f, drop_first=True)

encoded_df

In [None]:
# Feature matrix is the one-hot encoded frame; the label vector is the target column.
X, Y = encoded_df, df['target']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
#We need to scale our feature in a standard scale, it's our pre-processor
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(train_X)
train_X = sc.transform(train_X)
test_X = sc.transform(test_X)

In [None]:
from sklearn.linear_model import LogisticRegression

## Baseline classifier with scikit-learn's default hyper-parameters
logit = LogisticRegression()
## Learn the coefficients from the scaled training split
logit.fit(X=train_X, y=train_y)

In [None]:
# Hard class predictions for the held-out split.
pred_y = logit.predict(X=test_X)

from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=test_y, y_pred=pred_y)
print(cm)
# Fraction of correct predictions, shown as the cell output.
accuracy_score(y_true=test_y, y_pred=pred_y)

In [None]:
from sklearn import metrics

# Per-class precision, recall and F1 for the held-out predictions.
print(metrics.classification_report(y_true=test_y, y_pred=pred_y))

In [None]:
# grid search solver to find best fit
from sklearn.datasets import make_classification  # no longer used in this cell; kept in case other cells rely on it
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# define class weight dictionary, negative class has 20x weight
# NOTE(review): if class 1 is the minority class, this up-weights the majority
# class - confirm that this is the intended direction.
w = {0:20, 1:1}

# define dataset
# The search previously ran on synthetic make_classification data, so the
# chosen solver said nothing about this notebook's problem. It now runs on
# the scaled training split prepared above. The row count is capped with a
# seeded random subsample so that 7 folds x 3 repeats x 5 solvers stays
# tractable with n_jobs=1.
GRID_SEARCH_MAX_ROWS = 100_000
n_rows = min(len(train_y), GRID_SEARCH_MAX_ROWS)
rows = np.random.default_rng(1).choice(len(train_y), size=n_rows, replace=False)
# X and y stay bound under these names because the following cell reads them.
X = np.asarray(train_X)[rows]
y = np.asarray(train_y)[rows]
# define model
model = LogisticRegression(random_state=1, class_weight=w)
# define model evaluation method
cv = RepeatedStratifiedKFold(n_splits=7, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['solver'] = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
# define search
search = GridSearchCV(model, grid, scoring='roc_auc', cv=cv, n_jobs=1)
# perform the search
results = search.fit(X, y)
# summarize
# best_score_ is the mean cross-validated value of the scoring metric, which
# is ROC AUC here, so the label must not say "accuracy".
print('Mean ROC AUC: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# Predicted probability of the positive class (column 1) for every row of X,
# taken from the estimator refitted by the grid search.
probs1 = search.predict_proba(X)[:, 1]


from sklearn.metrics import accuracy_score, roc_curve, auc

def evaluate_roc(probs, y_true):
    """
    Report ROC AUC and accuracy for a set of predicted probabilities and
    plot the ROC curve.

    Parameters
    ----------
    probs : array-like
        Predicted positive-class probabilities (or scores), one per sample.
    y_true : array-like
        True binary labels aligned with ``probs``.

    Returns
    -------
    None
        - Prints the AUC and the accuracy obtained by thresholding at 0.5
        - Plots the ROC curve
    """
    # Use the argument that was passed in. The previous body read the
    # notebook-global `probs1`, so the `probs` parameter was silently ignored.
    preds = np.asarray(probs)
    fpr, tpr, _ = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')

    # Accuracy after turning probabilities into hard labels at the 0.5 cut-off
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')

    # Plot the ROC curve against the chance diagonal
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    
# Score the grid-searched classifier: print AUC and accuracy, then draw the ROC curve.
evaluate_roc(probs=probs1, y_true=y)