In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("Datasets/Consumer_Complaints_train.csv")
test_data = pd.read_csv("Datasets/Consumer_Complaints_test.csv")

In [3]:
train_data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2015-10-14,Credit reporting,,Incorrect information on credit report,Information is not mine,,,Equifax,GA,30134,,Consent not provided,Web,2015-10-14,Closed with explanation,Yes,No,1605653
1,2015-04-26,Bank account or service,Other bank product/service,Deposits and withdrawals,,RE : XXXX XXXX XXXX-PRIVILEGED AND CONFIDENTIA...,,Wells Fargo & Company,GA,319XX,,Consent provided,Web,2015-04-26,Closed with explanation,Yes,Yes,1347613
2,2013-12-20,Credit card,,Other,,,,Citibank,SC,29203,,,Phone,2014-01-03,Closed with non-monetary relief,Yes,No,640394
3,2016-03-03,Debt collection,"Other (i.e. phone, health club, etc.)",Disclosure verification of debt,Not given enough info to verify debt,,Company has responded to the consumer and the ...,"FAIR COLLECTIONS & OUTSOURCING, INC.",OH,43082,,,Referral,2016-03-04,Closed with explanation,Yes,No,1815134
4,2015-01-30,Debt collection,Medical,Disclosure verification of debt,Not given enough info to verify debt,,,"HCFS Health Care Financial Services, Inc.",CA,90036,,,Web,2015-01-30,Closed with explanation,Yes,Yes,1218613


In [4]:
test_data.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Complaint ID
0,2015-01-17,Credit card,,Customer service / Customer relations,,,,Citibank,TX,75241,,,Web,2015-01-17,Closed with monetary relief,Yes,1198133
1,2016-06-22,Consumer Loan,Title loan,Payment to acct not credited,,,Company believes it acted appropriately as aut...,"Larsen MacColl Partners II, L.P.",TX,76548,Servicemember,,Phone,2016-06-22,Closed with explanation,Yes,1979989
2,2015-09-04,Credit card,,Credit line increase/decrease,,I WANT TO REQUEST A CREDIT LINE INCREASE OF XX...,,Capital One,NC,271XX,,Consent provided,Web,2015-09-04,Closed with explanation,Yes,1552090
3,2016-05-17,Consumer Loan,Installment loan,Problems when you are unable to pay,,I have asked One Main Financial not to call my...,,"OneMain Financial Holdings, LLC",MO,634XX,,Consent provided,Web,2016-05-20,Closed with non-monetary relief,Yes,1929306
4,2016-07-07,Debt collection,"Other (i.e. phone, health club, etc.)",Improper contact or sharing of info,Contacted employer after asked not to,I have received several calls from a XXXX XXXX...,Company has responded to the consumer and the ...,"GMA Investments, LLC",SC,296XX,,Consent provided,Web,2016-07-07,Closed with explanation,Yes,2001667


In [5]:
print('Columns of train data')
print(train_data.columns)
print('\nColumns of test data')
print(test_data.columns)

Columns of train data
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Consumer disputed?', 'Complaint ID'],
      dtype='object')

Columns of test data
Index(['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Tags', 'Consumer consent provided?',
       'Submitted via', 'Date sent to company', 'Company response to consumer',
       'Timely response?', 'Complaint ID'],
      dtype='object')


In [6]:
# Calculate percentage of missing values in train data
train_missing_percentage = (train_data.isnull().sum() / len(train_data)) * 100

# Calculate percentage of missing values in test data
test_missing_percentage = (test_data.isnull().sum() / len(test_data)) * 100

# Print missing values percentage for train data
print("Missing values percentage for train data:")
print(train_missing_percentage)

# Print missing values percentage for test data
print("\nMissing values percentage for test data:")
print(test_missing_percentage)

Missing values percentage for train data:
Date received                    0.000000
Product                          0.000000
Sub-product                     28.925058
Issue                            0.000000
Sub-issue                       61.139322
Consumer complaint narrative    84.342688
Company public response         81.067696
Company                          0.000000
State                            0.809063
ZIP code                         0.811293
Tags                            86.002062
Consumer consent provided?      71.689752
Submitted via                    0.000000
Date sent to company             0.000000
Company response to consumer     0.000000
Timely response?                 0.000000
Consumer disputed?               0.000000
Complaint ID                     0.000000
dtype: float64

Missing values percentage for test data:
Date received                    0.000000
Product                          0.000000
Sub-product                     28.997709
Issue              

In [7]:
# Drop columns with missing values exceeding 25% from train data
train_data = train_data.drop(train_missing_percentage[train_missing_percentage > 25].index, axis=1)

# Drop columns with missing values exceeding 25% from test data
test_data = test_data.drop(test_missing_percentage[test_missing_percentage > 25].index, axis=1)

# Display the updated train and test data
print("Train data after dropping columns with more than 25% missing values:")
print(train_data.head())

print("\nTest data after dropping columns with more than 25% missing values:")
print(test_data.head())

Train data after dropping columns with more than 25% missing values:
  Date received                  Product  \
0    2015-10-14         Credit reporting   
1    2015-04-26  Bank account or service   
2    2013-12-20              Credit card   
3    2016-03-03          Debt collection   
4    2015-01-30          Debt collection   

                                    Issue  \
0  Incorrect information on credit report   
1                Deposits and withdrawals   
2                                   Other   
3         Disclosure verification of debt   
4         Disclosure verification of debt   

                                     Company State ZIP code Submitted via  \
0                                    Equifax    GA    30134           Web   
1                      Wells Fargo & Company    GA    319XX           Web   
2                                   Citibank    SC    29203         Phone   
3       FAIR COLLECTIONS & OUTSOURCING, INC.    OH    43082      Referral   
4  HCFS He

In [8]:
# Convert 'Date received' column in train data to datetime format
train_data['Date received'] = pd.to_datetime(train_data['Date received'])

# Extract year, month, and day into separate columns
train_data['Year received'] = train_data['Date received'].dt.year
train_data['Month received'] = train_data['Date received'].dt.month
train_data['Day received'] = train_data['Date received'].dt.day

print(train_data)


# Convert 'Date received' column in test data to datetime format
test_data['Date received'] = pd.to_datetime(test_data['Date received'])

# Extract year, month, and day into separate columns
test_data['Year received'] = test_data['Date received'].dt.year
test_data['Month received'] = test_data['Date received'].dt.month
test_data['Day received'] = test_data['Date received'].dt.day

test_data.head()

       Date received                  Product  \
0         2015-10-14         Credit reporting   
1         2015-04-26  Bank account or service   
2         2013-12-20              Credit card   
3         2016-03-03          Debt collection   
4         2015-01-30          Debt collection   
...              ...                      ...   
358805    2014-04-14         Credit reporting   
358806    2013-03-14                 Mortgage   
358807    2013-02-01              Credit card   
358808    2015-06-25          Debt collection   
358809    2015-05-12         Credit reporting   

                                           Issue  \
0         Incorrect information on credit report   
1                       Deposits and withdrawals   
2                                          Other   
3                Disclosure verification of debt   
4                Disclosure verification of debt   
...                                          ...   
358805    Incorrect information on credit repor

Unnamed: 0,Date received,Product,Issue,Company,State,ZIP code,Submitted via,Date sent to company,Company response to consumer,Timely response?,Complaint ID,Year received,Month received,Day received
0,2015-01-17,Credit card,Customer service / Customer relations,Citibank,TX,75241,Web,2015-01-17,Closed with monetary relief,Yes,1198133,2015,1,17
1,2016-06-22,Consumer Loan,Payment to acct not credited,"Larsen MacColl Partners II, L.P.",TX,76548,Phone,2016-06-22,Closed with explanation,Yes,1979989,2016,6,22
2,2015-09-04,Credit card,Credit line increase/decrease,Capital One,NC,271XX,Web,2015-09-04,Closed with explanation,Yes,1552090,2015,9,4
3,2016-05-17,Consumer Loan,Problems when you are unable to pay,"OneMain Financial Holdings, LLC",MO,634XX,Web,2016-05-20,Closed with non-monetary relief,Yes,1929306,2016,5,17
4,2016-07-07,Debt collection,Improper contact or sharing of info,"GMA Investments, LLC",SC,296XX,Web,2016-07-07,Closed with explanation,Yes,2001667,2016,7,7


In [9]:
# Convert 'Date sent to company' column in train data to datetime format
train_data['Date sent to company'] = pd.to_datetime(train_data['Date sent to company'])

# Convert 'Date sent to company' column in test data to datetime format
test_data['Date sent to company'] = pd.to_datetime(test_data['Date sent to company'])

In [10]:
# Calculate the number of days the complaint was with the company
train_data['Days held'] = (train_data['Date sent to company'] - train_data['Date received']).dt.days


test_data['Days held'] = (test_data['Date sent to company'] - test_data['Date received']).dt.days

In [11]:
# Convert "Days Held" column to integer type
train_data['Days held'] = train_data['Days held'].astype(int)


test_data['Days held'] = test_data['Days held'].astype(int)

train_data.head()

Unnamed: 0,Date received,Product,Issue,Company,State,ZIP code,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Year received,Month received,Day received,Days held
0,2015-10-14,Credit reporting,Incorrect information on credit report,Equifax,GA,30134,Web,2015-10-14,Closed with explanation,Yes,No,1605653,2015,10,14,0
1,2015-04-26,Bank account or service,Deposits and withdrawals,Wells Fargo & Company,GA,319XX,Web,2015-04-26,Closed with explanation,Yes,Yes,1347613,2015,4,26,0
2,2013-12-20,Credit card,Other,Citibank,SC,29203,Phone,2014-01-03,Closed with non-monetary relief,Yes,No,640394,2013,12,20,14
3,2016-03-03,Debt collection,Disclosure verification of debt,"FAIR COLLECTIONS & OUTSOURCING, INC.",OH,43082,Referral,2016-03-04,Closed with explanation,Yes,No,1815134,2016,3,3,1
4,2015-01-30,Debt collection,Disclosure verification of debt,"HCFS Health Care Financial Services, Inc.",CA,90036,Web,2015-01-30,Closed with explanation,Yes,Yes,1218613,2015,1,30,0


In [12]:
# Drop specified columns from train_data
train_data.drop(columns=['Date received', 'Date sent to company', 'ZIP code', 'Complaint ID'], inplace=True)

# If you want to include both train and test data:
test_data.drop(columns=['Date received', 'Date sent to company', 'ZIP code', 'Complaint ID'], inplace=True)


In [13]:
# Define the bins for categorizing days into weeks
bins = [-float('inf'), 7, 14, 21, 28, float('inf')]
labels = ['Week 1', 'Week 2', 'Week 3', 'Week 4', 'Week 5+']

# Categorize days into weeks
train_data['Weeks received'] = pd.cut(train_data['Day received'], bins=bins, labels=labels, right=False)

# If you want to include both train and test data:
test_data['Weeks received'] = pd.cut(test_data['Day received'], bins=bins, labels=labels, right=False)

In [14]:
# Drop the "Day received" column from train_data
train_data.drop(columns=['Day received'], inplace=True)

# If you want to include both train and test data:
test_data.drop(columns=['Day received'], inplace=True)

In [15]:
disputed_cons = train_data[train_data['Consumer disputed?'] == 'Yes'].copy()

In [16]:
days_held_description = disputed_cons['Days held'].describe()
print(days_held_description)

count    76172.000000
mean         3.638962
std         14.254591
min         -1.000000
25%          0.000000
50%          1.000000
75%          3.000000
max        631.000000
Name: Days held, dtype: float64


In [17]:
# Convert negative values in the "Days Held" column to zero
disputed_cons['Days held'] = disputed_cons['Days held'].apply(lambda x: max(0, x))


In [18]:
train_data

Unnamed: 0,Product,Issue,Company,State,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,Year received,Month received,Days held,Weeks received
0,Credit reporting,Incorrect information on credit report,Equifax,GA,Web,Closed with explanation,Yes,No,2015,10,0,Week 3
1,Bank account or service,Deposits and withdrawals,Wells Fargo & Company,GA,Web,Closed with explanation,Yes,Yes,2015,4,0,Week 4
2,Credit card,Other,Citibank,SC,Phone,Closed with non-monetary relief,Yes,No,2013,12,14,Week 3
3,Debt collection,Disclosure verification of debt,"FAIR COLLECTIONS & OUTSOURCING, INC.",OH,Referral,Closed with explanation,Yes,No,2016,3,1,Week 1
4,Debt collection,Disclosure verification of debt,"HCFS Health Care Financial Services, Inc.",CA,Web,Closed with explanation,Yes,Yes,2015,1,0,Week 5+
...,...,...,...,...,...,...,...,...,...,...,...,...
358805,Credit reporting,Incorrect information on credit report,Experian,OR,Web,Closed with non-monetary relief,Yes,No,2014,4,-1,Week 3
358806,Mortgage,"Loan modification,collection,foreclosure",Citibank,OH,Referral,Closed with non-monetary relief,Yes,No,2013,3,4,Week 3
358807,Credit card,Payoff process,Capital One,TX,Web,Closed,Yes,Yes,2013,2,0,Week 1
358808,Debt collection,Cont'd attempts collect debt not owed,Encore Capital Group,FL,Phone,Closed with non-monetary relief,Yes,No,2015,6,4,Week 4


In [19]:
test_data

Unnamed: 0,Product,Issue,Company,State,Submitted via,Company response to consumer,Timely response?,Year received,Month received,Days held,Weeks received
0,Credit card,Customer service / Customer relations,Citibank,TX,Web,Closed with monetary relief,Yes,2015,1,0,Week 3
1,Consumer Loan,Payment to acct not credited,"Larsen MacColl Partners II, L.P.",TX,Phone,Closed with explanation,Yes,2016,6,0,Week 4
2,Credit card,Credit line increase/decrease,Capital One,NC,Web,Closed with explanation,Yes,2015,9,0,Week 1
3,Consumer Loan,Problems when you are unable to pay,"OneMain Financial Holdings, LLC",MO,Web,Closed with non-monetary relief,Yes,2016,5,3,Week 3
4,Debt collection,Improper contact or sharing of info,"GMA Investments, LLC",SC,Web,Closed with explanation,Yes,2016,7,0,Week 2
...,...,...,...,...,...,...,...,...,...,...,...
119601,Credit card,Credit determination,Citibank,IL,Web,Closed with explanation,Yes,2015,2,0,Week 2
119602,Credit card,Balance transfer,Capital One,KS,Web,Closed with explanation,Yes,2012,11,1,Week 2
119603,Mortgage,"Loan modification,collection,foreclosure",Bank of America,CA,Referral,Closed without relief,Yes,2012,5,0,Week 3
119604,Credit reporting,Incorrect information on credit report,Experian,OH,Web,Closed with explanation,Yes,2015,6,0,Week 2


In [20]:
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Samapan
[nltk_data]     Kar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Samapan
[nltk_data]     Kar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Samapan
[nltk_data]     Kar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
relevant_text_train = train_data['Issue']
relevant_text_test = test_data['Issue']

# tokenize the datasets
tokenized_data_train = relevant_text_train.apply(lambda x: wordpunct_tokenize(x.lower()))
tokenized_data_test = relevant_text_test.apply(lambda x: wordpunct_tokenize(x.lower()))

# remove punctutions
def remove_punctuation(text):
    no_punctuation = []
    for w in text:
        if w not in string.punctuation:
            no_punctuation.append(w)
    return no_punctuation
no_punctuation_data_train = tokenized_data_train.apply(lambda x: remove_punctuation(x))
no_punctuation_data_test = tokenized_data_test.apply(lambda x: remove_punctuation(x))

# filtering stopwords
stop_words = stopwords.words('english')
filtered_sentence_train = [w for w in no_punctuation_data_train if not w in stop_words]
filtered_sentence_train = pd.Series(filtered_sentence_train)
filtered_sentence_test = [w for w in no_punctuation_data_test if not w in stop_words]
filtered_sentence_test = pd.Series(filtered_sentence_test)

# lemmatizing the datasets
def lemmatize_text(text):
    lem_text = [WordNetLemmatizer().lemmatize(w,pos = 'v') for w in text]
    return lem_text
lemmatized_data_train = filtered_sentence_train.apply(lambda x:lemmatize_text(x))
lemmatized_data_test = filtered_sentence_test.apply(lambda x:lemmatize_text(x))

# Stemming the datasets
def stem_text(text):
    stem_text = [PorterStemmer().stem(w) for w in text]
    return stem_text
stemmed_data_train = lemmatized_data_train.apply(lambda x:stem_text(x))
stemmed_data_test = lemmatized_data_test.apply(lambda x:stem_text(x))

# making sentences joining the stemmed words
def word_to_sentence(text):
    text_sentence = " ".join(text)
    return text_sentence
clean_data_train = stemmed_data_train.apply(lambda x:word_to_sentence(x))
clean_data_test = stemmed_data_test.apply(lambda x:word_to_sentence(x))

In [22]:
# importing necessary libraries
from collections import Counter

# checking all the tokenized words and total count of them
all_words_train = Counter([word for sentence in tokenized_data_train for word in sentence])
all_words_test = Counter([word for sentence in tokenized_data_test for word in sentence])

print("Total number of unique words in training data: ", len(all_words_train))
print("Total number of unique words in test data: ", len(all_words_test))

# checking the most common words in the training data
print("Most common words in training data: ", all_words_train.most_common())

# checking the most common words in the test data
print("Most common words in test data: ", all_words_test.most_common())

Total number of unique words in training data:  201
Total number of unique words in test data:  200
Most common words in training data:  [(',', 253982), ('loan', 113938), ('credit', 75055), ('account', 61863), ('collection', 61295), ('modification', 60185), ('foreclosure', 60185), ('or', 53281), ('report', 51612), ('incorrect', 43924), ('information', 43825), ('on', 43825), ('payments', 41961), ('debt', 41370), ("'", 39149), ('servicing', 38456), ('escrow', 38456), ('/', 31791), ('not', 28096), ('cont', 27266), ('d', 27266), ('attempts', 27266), ('collect', 27266), ('owed', 27266), ('closing', 21209), ('of', 20828), ('opening', 18124), ('management', 17557), ('and', 17184), ('my', 15920), ('disclosure', 12283), ('verification', 12283), ('communication', 11457), ('tactics', 11457), ('deposits', 11028), ('withdrawals', 11028), ('problems', 10173), ('to', 10156), ('unable', 9725), ('lease', 9497), ('other', 9160), ('the', 9029), ('application', 8680), ('billing', 8542), ('originator', 844

In [23]:
train_data['Issues_cleaned'] = clean_data_train
test_data['Issues_cleaned'] = clean_data_test
train_data = train_data.drop('Issue', axis = 1)
test_data = test_data.drop('Issue', axis = 1)

In [24]:
# Define the list of columns to be dropped
columns_to_drop = ['Company', 'State', 'Year received', 'Days held']

# Drop the unnecessary columns
train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

In [25]:
train_data['Consumer disputed?'] = train_data['Consumer disputed?'].replace({'Yes': 1, 'No': 0})
train_data.head()

Unnamed: 0,Product,Submitted via,Company response to consumer,Timely response?,Consumer disputed?,Month received,Weeks received,Issues_cleaned
0,Credit reporting,Web,Closed with explanation,Yes,0,10,Week 3,incorrect inform on credit report
1,Bank account or service,Web,Closed with explanation,Yes,1,4,Week 4,deposit and withdraw
2,Credit card,Phone,Closed with non-monetary relief,Yes,0,12,Week 3,other
3,Debt collection,Referral,Closed with explanation,Yes,0,3,Week 1,disclosur verif of debt
4,Debt collection,Web,Closed with explanation,Yes,1,1,Week 5+,disclosur verif of debt


In [26]:
# List of categorical features
categorical_features = ['Product', 'Submitted via', 'Company response to consumer', 'Timely response?','Weeks received']

# Create dummy variables for categorical features
dummy_variables_train = pd.get_dummies(train_data[categorical_features],dtype=int)
dummy_variables_test = pd.get_dummies(test_data[categorical_features],dtype=int)

In [27]:
# Concatenate the dummy variables with the original DataFrame
train_data = pd.concat([train_data, dummy_variables_train], axis=1)

# Drop the original categorical columns
train_data.drop(columns=categorical_features, inplace=True)

# Concatenate the dummy variables with the original DataFrame
test_data = pd.concat([test_data, dummy_variables_test], axis=1)

# Drop the original categorical columns
test_data.drop(columns=categorical_features, inplace=True)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
issues_cleaned_train = tf.fit_transform(train_data['Issues_cleaned']).toarray()
issues_cleaned_test = tf.fit_transform(test_data['Issues_cleaned']).toarray()
tf_columns_train = []
tf_columns_test = []
for i in range(issues_cleaned_train.shape[1]):
    tf_columns_train.append('Feature' + str(i+1))
for i in range(issues_cleaned_test.shape[1]):
    tf_columns_test.append('Feature' + str(i+1))
issues_train = pd.DataFrame(issues_cleaned_train, columns = tf_columns_train)
issues_test = pd.DataFrame(issues_cleaned_test, columns = tf_columns_test)
weights = pd.DataFrame(tf.idf_, index = tf.get_feature_names_out(), columns = ['Idf_weights']).sort_values(by = 'Idf_weights', ascending = False)
#weights.head()
weights

Unnamed: 0,Idf_weights
exchang,11.998819
damag,11.593354
destroy,11.593354
overdraft,11.305672
featur,11.305672
...,...
modif,2.782596
account,2.745228
credit,2.628233
collect,2.396302


In [29]:
train_data = train_data.drop('Issues_cleaned', axis = 1)
test_data = test_data.drop('Issues_cleaned', axis = 1)
train_data = pd.concat([train_data, issues_train], axis = 1)
test_data = pd.concat([test_data, issues_test], axis = 1)
Feature168 = [0] * 119606
test_data['Feature168'] = Feature168

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X = train_data.iloc[:,1:]
y = train_data.iloc[:,0]

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.25,random_state=42)



# Initialize the StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

test_data_scaled = scaler.transform(test_data)

In [31]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training set
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [32]:
from sklearn.decomposition import PCA
# Initialize PCA with n_components=None to retain all components
pca = PCA(n_components=None)

# Fit PCA on the scaled training data
pca.fit(X_train)
X_train.shape

(423918, 200)

In [33]:
# Calculate cumulative explained variance ratio
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Find the number of components required to cover 80% of the information
n_components = np.argmax(cumulative_variance >= 0.8) + 1

# Initialize PCA with the selected number of components
pca = PCA(n_components=n_components)

# Fit PCA on the scaled training data
pca.fit(X_train)

# Transform the training and testing data using the selected number of components
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
test_data_scaled_pca = pca.transform(test_data_scaled)

# Get the top n features
top_n_features = pca.components_[:n_components, :]

# Create a new DataFrame with the top features
top_features_df = pd.DataFrame(top_n_features, columns=X.columns)

# Optionally, you can concatenate the PCA-transformed features with the dependent variable (if needed)
top_features_df = pd.concat([y, top_features_df], axis=1)

In [34]:
X_train_pca.shape,X_test_pca.shape,y_train.shape,y_test.shape

((423918, 55), (89703, 55), (423918,), (89703,))

In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report, f1_score, precision_score, recall_score

In [36]:
from sklearn.linear_model import LogisticRegression
# Initialize the Logistic Regression model
logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(X_train_pca,y_train)

y_test_pred = logistic_regression_model.predict(X_test_pca)

logistic_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", logistic_accuracy)

Accuracy: 0.5662575387668194


In [37]:
# Calculate ROC AUC score (if it's a binary classification problem)
roc_auc = roc_auc_score(y_test, y_test_pred)
print("ROC AUC Score:", roc_auc)

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

ROC AUC Score: 0.5657872526318096
Confusion Matrix:
[[40047 30632]
 [ 8276 10748]]
F1 Score: 0.35587047215416195
Precision: 0.2597390043499275
Recall: 0.5649705634987384


In [38]:
from sklearn.tree import DecisionTreeClassifier
# Initialize the Decision Tree Classifier model
decision_tree_model = DecisionTreeClassifier()

# Train the model on the training data
decision_tree_model.fit(X_train_pca, y_train)

# Predict on the validation set
y_test_pred = decision_tree_model.predict(X_test_pca)

# Calculate validation accuracy
decision_tree_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", decision_tree_accuracy)

Accuracy: 0.5787877774433408


In [39]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[41848 28831]
 [ 8953 10071]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.59      0.69     70679
           1       0.26      0.53      0.35     19024

    accuracy                           0.58     89703
   macro avg       0.54      0.56      0.52     89703
weighted avg       0.70      0.58      0.62     89703

F1 Score: 0.3477195041950074
Precision: 0.258881291450311
Recall: 0.5293839360807401


In [40]:
from sklearn.ensemble import RandomForestClassifier
# Initialize the Random Forest Classifier model
random_forest_model = RandomForestClassifier()

# Train the model on the training data
random_forest_model.fit(X_train_pca, y_train)

# Predict on the validation set
y_test_pred = random_forest_model.predict(X_test_pca)

# Calculate validation accuracy
random_forest_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", random_forest_accuracy)

Accuracy: 0.571708861465057


In [41]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[40974 29705]
 [ 8714 10310]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.58      0.68     70679
           1       0.26      0.54      0.35     19024

    accuracy                           0.57     89703
   macro avg       0.54      0.56      0.52     89703
weighted avg       0.70      0.57      0.61     89703

F1 Score: 0.3492606582089805
Precision: 0.25765337998250654
Recall: 0.5419470142977292


In [42]:
from sklearn.ensemble import AdaBoostClassifier
# Initialize the AdaBoost Classifier model
adaboost_model = AdaBoostClassifier()

# Train the model on the training data
adaboost_model.fit(X_train_pca, y_train)

# Predict on the validation set
y_test_pred = adaboost_model.predict(X_test_pca)

# Calculate validation accuracy
ada_boost_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", ada_boost_accuracy)



Accuracy: 0.5263926513048616


In [43]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[34703 35976]
 [ 6508 12516]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.49      0.62     70679
           1       0.26      0.66      0.37     19024

    accuracy                           0.53     89703
   macro avg       0.55      0.57      0.50     89703
weighted avg       0.72      0.53      0.57     89703

F1 Score: 0.37075656140766633
Precision: 0.2581044295966345
Recall: 0.657905803195963


In [44]:
from sklearn.ensemble import GradientBoostingClassifier
# Initialize the Gradient Boosting Classifier model
gradient_boosting_model = GradientBoostingClassifier()

# Train the model on the training data
gradient_boosting_model.fit(X_train_pca, y_train)

# Predict on the validation set
y_test_pred = gradient_boosting_model.predict(X_test_pca)

# Calculate validation accuracy
gradient_boosting_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", gradient_boosting_accuracy)

Accuracy: 0.5315764244228175


In [45]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[35070 35609]
 [ 6410 12614]]
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.50      0.63     70679
           1       0.26      0.66      0.38     19024

    accuracy                           0.53     89703
   macro avg       0.55      0.58      0.50     89703
weighted avg       0.72      0.53      0.57     89703

F1 Score: 0.3751542819754041
Precision: 0.2615764261866744
Recall: 0.6630571909167368


In [46]:
from sklearn.neighbors import KNeighborsClassifier
# Initialize the KNN classifier model
knn_model = KNeighborsClassifier(n_neighbors=7,weights='distance')

# Fit the model to the training data
knn_model.fit(X_train_pca, y_train)

# Predict the labels of the test data
y_test_pred = knn_model.predict(X_test_pca)

# Calculate the accuracy of the model
knn_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", knn_accuracy)

Accuracy: 0.7440330869647614


In [47]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[64818  5861]
 [17100  1924]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85     70679
           1       0.25      0.10      0.14     19024

    accuracy                           0.74     89703
   macro avg       0.52      0.51      0.50     89703
weighted avg       0.68      0.74      0.70     89703

F1 Score: 0.14353388787347532
Precision: 0.24714193962748876
Recall: 0.10113540790580319


In [48]:
from xgboost import XGBClassifier
# Initialize the XGBoost classifier model
xgb_model = XGBClassifier()

# Fit the model to the training data
xgb_model.fit(X_train_pca, y_train)

# Predict the labels of the test data
y_test_pred = xgb_model.predict(X_test_pca)

# Calculate the accuracy of the model
xgboost_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", xgboost_accuracy)

Accuracy: 0.541431167296523


In [49]:
# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test, y_test_pred)
print("Classification Report:")
print(class_report)

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

# Precision and Recall
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
print("Precision:", precision)
print("Recall:", recall)

Confusion Matrix:
[[36399 34280]
 [ 6855 12169]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.51      0.64     70679
           1       0.26      0.64      0.37     19024

    accuracy                           0.54     89703
   macro avg       0.55      0.58      0.51     89703
weighted avg       0.72      0.54      0.58     89703

F1 Score: 0.3717257495456142
Precision: 0.2619862645051562
Recall: 0.6396656854499579


In [50]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Initialize the kNN model
knn = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit GridSearchCV
grid_search.fit(X_train_pca, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Predict using the best model
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test_pca)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best Score: 0.5698909463740982
Accuracy: 0.7158400499425883
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.86      0.83     70679
           1       0.25      0.17      0.20     19024

    accuracy                           0.72     89703
   macro avg       0.52      0.52      0.52     89703
weighted avg       0.68      0.72      0.69     89703

