# Sample Team Project

- Used for MMAI 891

In [1]:
import datetime

# Stamp the run so the outputs below can be dated.
run_started_at = datetime.datetime.now()
print(run_started_at)

2022-02-15 11:48:17.992851


In [2]:
import pandas as pd

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression statement in a cell, not only the
# last one; later cells rely on this to show several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import sklearn

# Record the installed scikit-learn version next to the results.
print(f'The scikit-learn version is {sklearn.__version__}.')

The scikit-learn version is 0.24.1.


# Read Data

In [4]:
# Labelled training data; the path is relative to the notebook's working directory.
train_csv_path = "../data/finance_train.csv"
df = pd.read_csv(train_csv_path)

# Column summary (dtypes, non-null counts) followed by a preview of the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10466 entries, 0 to 10465
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        10466 non-null  int64 
 1   text      10466 non-null  object
 2   category  10466 non-null  object
dtypes: int64(1), object(2)
memory usage: 245.4+ KB


Unnamed: 0,id,text,category
0,5396,I made a purchase recently but I have decided ...,request_refund
1,2202,"Cancel my transfer, please.",cancel_transfer
2,3768,How do I change my last name?,edit_personal_details
3,4967,A withdraw I tried to make at the ATM was decl...,declined_cash_withdrawal
4,5622,I tried to buy something online yesterday and ...,declined_transfer


In [6]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the labelled category.
X, y = df['text'], df['category']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on `y` -- with many categories,
# consider whether `stratify=y` is wanted here.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the size of each partition (features and labels should match pairwise).
X_train.shape
y_train.shape
X_val.shape
y_val.shape

(8372,)

(8372,)

(2094,)

(2094,)

# Feature Engineering Pipeline

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Bag-of-words features: unigrams only, English stop words removed, terms kept
# only when they occur in between 1% and 80% of the documents, and the
# vocabulary capped at 300 terms.
# `ngram_range` is given as a tuple, which is the type scikit-learn documents
# for this parameter; releases that validate constructor arguments reject a list.
vectorizer = CountVectorizer(
    min_df=.01,
    max_df=.8,
    ngram_range=(1, 1),
    max_features=300,
    stop_words='english',
)

# Chain vectorisation and the classifier so both are fitted and applied together;
# the fixed random_state makes the tree reproducible.
pipe = Pipeline([('vec', vectorizer), ('clf', DecisionTreeClassifier(random_state=223))])

# Model Fitting

In [8]:
# Fit the vectorizer and the classifier together on the training split only.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('vec',
                 CountVectorizer(max_df=0.8, max_features=300, min_df=0.01,
                                 ngram_range=[1, 1], stop_words='english')),
                ('clf', DecisionTreeClassifier(random_state=223))])

# Model Assessment

In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Score the fitted pipeline on the held-out validation texts.
pred_val = pipe.predict(X_val)

# Build both summaries first, then print them in the same order as before:
# the confusion matrix, then the per-class precision/recall/F1 report.
val_confusion = confusion_matrix(y_val, pred_val)
val_report = classification_report(y_val, pred_val)
print(val_confusion)
print(val_report)

[[19  0  1 ...  0  0  0]
 [ 0 14  0 ...  0  0  0]
 [ 0  0 13 ...  0  0  0]
 ...
 [ 0  0  0 ...  6  0  0]
 [ 0  0  0 ...  0 14  0]
 [ 0  0  0 ...  0  1 17]]
                                                  precision    recall  f1-score   support

                           Refund_not_showing_up       0.56      0.70      0.62        27
                                activate_my_card       0.10      0.54      0.17        26
                                       age_limit       0.25      0.50      0.33        26
                         apple_pay_or_google_pay       0.62      0.67      0.64        27
                                     atm_support       0.67      0.38      0.48        16
                                automatic_top_up       0.38      0.10      0.15        31
         balance_not_updated_after_bank_transfer       0.57      0.47      0.52        36
balance_not_updated_after_cheque_or_cash_deposit       0.51      0.68      0.58        34
                         benefici

# Kaggle Predictions

In [10]:
# Test data to predict on; the path is relative to the notebook's working directory.
test_csv_path = '../data/finance_test.csv'
df_test = pd.read_csv(test_csv_path)

# Column summary followed by a preview of the first rows.
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 2616
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      2617 non-null   int64 
 1   text    2617 non-null   object
dtypes: int64(1), object(1)
memory usage: 41.0+ KB


Unnamed: 0,id,text
0,5037,The ATM cancelled a transaction on me when I w...
1,3461,where is my money from this morning
2,995,I would like to know which fiat currencies are...
3,2890,Why did I get a fee?
4,9274,I need my card to be activated right now.


In [11]:
# Predict a category for every test message with the fitted pipeline.
test_text = df_test['text']
pred_test = pipe.predict(test_text)

In [12]:
# Display the predicted labels; the array repr is abbreviated for long arrays.
pred_test

array(['declined_cash_withdrawal', 'verify_source_of_funds',
       'fiat_currency_support', ..., 'card_not_working',
       'transaction_charged_twice', 'direct_debit_payment_not_recognised'],
      dtype=object)

In [13]:
# Two-column frame for submission: the test row id and its predicted category.
submission_columns = {'Id': df_test['id'], 'Predicted': pred_test}
my_submission = pd.DataFrame(submission_columns)

# Preview the first rows before writing the file.
my_submission.head()

Unnamed: 0,Id,Predicted
0,5037,declined_cash_withdrawal
1,3461,verify_source_of_funds
2,995,fiat_currency_support
3,2890,extra_charge_on_statement
4,9274,activate_my_card


In [14]:
# Write the submission next to the notebook; index=False keeps the row index out of the file.
my_submission.to_csv(path_or_buf='my_submission.csv', index=False)