In [34]:
import warnings
warnings.filterwarnings('ignore')

In [35]:
from sqlalchemy import create_engine
import pandas as pd

In [36]:
# Take the connection URL from the DATABASE_URL environment variable so real
# credentials never have to be written into the notebook; the local
# development URL is only the fallback when the variable is unset.
import os

engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://postgres:postgres@localhost:5432/postgres')
)

In [37]:
# Read the whole `ccf` table into a DataFrame. The context manager closes the
# connection as soon as the read finishes instead of leaving it open.
with engine.connect() as connection:
    new_df = pd.read_sql('select * from ccf', connection)

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [None]:
# Load the transactions from the CSV export. This rebinds `new_df`, replacing
# whatever frame the name held before, then previews the first five rows.
file_path = Path('card_transaction.v1.csv')
new_df = pd.read_csv(filepath_or_buffer=file_path)
new_df.head(5)

In [None]:
# Remove every space from the column labels (e.g. so they can be used without
# quoting whitespace), then show the frame with the new labels.
compact_labels = new_df.columns.str.replace(' ', '')
new_df.columns = compact_labels
new_df

In [None]:
# Number of rows for each distinct value of the 'IsFraud?' column.
new_df['IsFraud?'].value_counts()

In [None]:
# Count of missing entries in each column.
new_df.isnull().sum()

In [None]:
# Display the frame in its current state.
new_df

In [None]:
# Column labels after the spaces were removed.
new_df.columns

In [None]:
# Labels of the columns that appear in the describe() summary.
new_df.describe().columns.tolist()

In [None]:
# Strip the literal dollar sign, then parse the remainder as a number.
# regex=False is essential: read as a regular expression, '$' is the
# end-of-string anchor rather than the character itself.
new_df['Amount'] = pd.to_numeric(new_df['Amount'].str.replace('$', '', regex=False))
new_df

In [None]:
# Columns carried into the modelling frame; 'IsFraud?' is the target.
cols_to_keep = [
    'Amount',
    'Card',
    'Year',
    'Month',
    'Day',
    'MCC',
    'IsFraud?',
]

In [None]:
# Number of rows for each distinct value of the 'UseChip' column.
new_df['UseChip'].value_counts()

In [None]:
# Snapshot the cleaned frame to disk; the row index is written as the first column.
new_df.to_csv('data1.csv', index=True)

In [None]:
# One indicator column per distinct 'UseChip' value.
usechip_dummies_df = pd.get_dummies(data=new_df['UseChip'])

In [None]:
# state_dummies_df = pd.get_dummies(new_df['MerchantState'])
# state_dummies_df

In [None]:
# Keep all rows but only the modelling columns.
model_df = new_df.loc[:, cols_to_keep]
model_df

In [None]:
# (rows, columns) of the UseChip indicator frame.
usechip_dummies_df.shape

In [None]:
# (rows, columns) of the full transaction frame.
new_df.shape

In [None]:
# the model that is ready for XGBoost and resampling
# Built from new_df[cols_to_keep] rather than from model_df itself, so that
# re-running this cell does not append the indicator columns a second time.
model_df = pd.concat([new_df[cols_to_keep], usechip_dummies_df], axis=1)
model_df

In [None]:
# Class counts of the target column in the modelling frame.
model_df['IsFraud?'].value_counts()

In [None]:
# Per-year breakdown of fraudulent ("Yes") versus legitimate ("No") transactions.
for year in new_df['Year'].unique():
    in_year = new_df['Year'] == year
    print(year, new_df.loc[in_year, 'IsFraud?'].value_counts())

## Target Variable Class Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of how many transactions fall in each target class.
fraud_counts = model_df['IsFraud?'].value_counts()
ax = fraud_counts.plot(kind="bar", figsize=(10, 6), fontsize=13, color="#192bb0")
ax.set_title("Credit Card Fraud (No = No Fraud, Yes = Fraud)", size=20, pad=30)
ax.set_ylabel("Number of Transactions", fontsize=14)

# Write each bar's height just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + 0.19, bar_height + 700, str(round(bar_height, 2)), fontsize=15)


In [None]:
# Encode the string target column as integers
def change_string(Fraud):
    """Encode a fraud label as an integer flag.

    Args:
        Fraud: the raw label ('Yes' / 'No') or an already-encoded value.

    Returns:
        1 for the string 'Yes' or for a value equal to 1, otherwise 0.

    Accepting the already-encoded 1 keeps the function safe to apply twice:
    without it, re-running the encoding cell would turn every fraud row into 0.
    """
    if isinstance(Fraud, str):
        return 1 if Fraud == 'Yes' else 0
    return 1 if Fraud == 1 else 0

# Replace each label in the target column with its integer encoding.
model_df["IsFraud?"] = model_df["IsFraud?"].map(change_string)
model_df.head()

In [None]:
# How many rows carry each distinct Card value (what the values mean is still an open question).
for card in model_df['Card'].unique():
    same_card = model_df['Card'] == card
    print(card, model_df.loc[same_card, 'Card'].value_counts())

In [None]:
# Target: the fraud flag as a plain array. Features: every other column.
y = model_df["IsFraud?"].values
X = model_df.drop(columns="IsFraud?")

In [None]:
# Inspect the target array.
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Fix the seed so the split, and everything fitted on it, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
from sklearn.model_selection import train_test_split
# Seeded train/test split, followed by the class counts of the training labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Oversample the training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
# Class counts of the training labels after oversampling.
Counter(y_resampled)

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() hands back the fitted estimator, so build and train in one expression,
# then echo the model as the cell output.
model = LogisticRegression(solver='lbfgs', random_state=1).fit(X_resampled, y_resampled)
model

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Learning rates to compare; extend the list to widen the search.
learning_rates = [0.75]

for learning_rate in learning_rates:
    # The estimator must be bound to the name that is fitted and scored below.
    classifier = GradientBoostingClassifier(n_estimators=10,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(classifier.score(X_train_scaled,
                                                                       y_train)))
    print("Accuracy score (validation): {0:.3f}".format(classifier.score(X_test_scaled, y_test)))

## Data Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale only the columns whose maximum exceeds 1; columns already within
# 0..1 are left as they are.
to_scale = [col for col in model_df.columns if model_df[col].max() > 1]
mms = MinMaxScaler()

# Guard against an empty selection (e.g. when this cell is run a second time
# and everything is already scaled): fitting the scaler on zero columns fails.
if to_scale:
    # Reuse model_df's own index so the scaled values line up row-for-row
    # even when the index is not the default 0..n-1 range.
    scaled = pd.DataFrame(
        mms.fit_transform(model_df[to_scale]),
        columns=to_scale,
        index=model_df.index,
    )
    # Replace the original columns with the scaled ones.
    model_df[to_scale] = scaled

model_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the fraud flag; features are all remaining columns.
y = model_df["IsFraud?"]
X = model_df.drop(columns="IsFraud?")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Share of the positive class (label 1) in each split, as a percentage.
train_positive_pct = np.round(y_train.value_counts(normalize=True)[1]*100,2)
test_positive_pct = np.round(y_test.value_counts(normalize=True)[1]*100, 2)

print (f'''% Positive class in Train = {train_positive_pct}
% Positive Class in Test = {test_positive_pct}''')

In [33]:
# Import dependencies needed
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, confusion_matrix

# Train
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Evaluate
print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
cm = confusion_matrix(y_test, preds)

# Draw the matrix with scikit-learn's own display helper: `sns` is not
# imported anywhere in the visible notebook, so the previous heatmap call
# could not run. values_format="d" prints the counts as plain integers.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Confusion Matrix (without SMOTE)', size=16)
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, cmap="Blues", values_format="d");



KeyboardInterrupt: 

In [None]:
pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the full feature/target pair with SMOTE and compare sizes.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after Smote: {X_sm.shape}''')

print('\nBalance of positive and negative classes (%):')
class_share = y_sm.value_counts(normalize=True)
class_share * 100

In [None]:
# Oversample the *current* training split before fitting. The X_resampled
# built earlier in the notebook came from a different split of the unscaled
# features, so a model trained on it cannot be evaluated on this X_test.
X_resampled, y_resampled = RandomOverSampler(random_state=1).fit_resample(X_train, y_train)

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

# Only the last expression of a cell is displayed, so print every metric
# explicitly instead of computing two of them and discarding the result.
print(balanced_accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Import dependencies needed
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score, confusion_matrix

# Split the original data first and apply SMOTE to the training part only.
# Resampling before the split puts synthetic points, interpolated from rows
# that end up in the test set, into the training data, and the test set
# itself stops reflecting the real class balance; both inflate the scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Train on the balanced training data, evaluate on the untouched test split.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_sm, y_train_sm)
preds = model.predict(X_test)

print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
cm = confusion_matrix(y_test, preds)

# Draw the matrix with scikit-learn's own display helper: `sns` is not
# imported anywhere in the visible notebook, so the previous heatmap call
# could not run. values_format="d" prints the counts as plain integers.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title('Confusion Matrix (with SMOTE)', size=16)
ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, cmap="Blues", values_format="d");

In [None]:
# Persist the modelling frame. `model_df` is a notebook variable, not an
# attribute of the pandas module, so it is referenced directly.
model_df.to_csv('data2')