##CSC 820  
### Homework 9  
Andrew Dahlstrom  
4/10/24  

Adapted from source:   
A Deep Dive Into Sklearn Pipelines  
https://www.kaggle.com/code/baghern/a-deep-dive-into-sklearn-pipelines  

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSC 820/HW9/train.csv')

df.dropna(axis=0)
df.set_index('id', inplace = True)

df.head()

Unnamed: 0_level_0,text,author
id,Unnamed: 1_level_1,Unnamed: 2_level_1
id26305,"This process, however, afforded me no means of...",EAP
id17569,It never once occurred to me that the fumbling...,HPL
id11008,"In his left hand was a gold snuff box, from wh...",EAP
id27763,How lovely is spring As we looked from Windsor...,MWS
id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
import re
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

stopWords = set(stopwords.words('english'))

# Helper function to tokenize the text into words and identify the particles of speech (adjectives, nouns, verbs)
# Returns count of the particles of speech
def pos_count(text, pos_list):
    tokens = word_tokenize(text)
    pos_words = pos_tag(tokens)
    return sum(1 for word, tag in pos_words if tag in pos_list)

# creating a function to encapsulate preprocessing, to mkae it easy to replicate on  submission data
def processing(df):
    #lowering and removing punctuation
    df['processed'] = df['text'].apply(lambda x: re.sub(r'[^\w\s]','', x.lower()))

    #numerical feature engineering
    #total length of sentence
    df['length'] = df['processed'].apply(lambda x: len(x))
    #get number of words
    df['words'] = df['processed'].apply(lambda x: len(x.split(' ')))
    df['words_not_stopword'] = df['processed'].apply(lambda x: len([t for t in x.split(' ') if t not in stopWords]))
    #get the average word length
    df['avg_word_length'] = df['processed'].apply(lambda x: np.mean([len(t) for t in x.split(' ') if t not in stopWords]) if len([len(t) for t in x.split(' ') if t not in stopWords]) > 0 else 0)
    #get the average word length
    df['commas'] = df['text'].apply(lambda x: x.count(','))

    # Using the tags from the Penn Treebank tagset
    adjective_tags = ['JJ', 'JJR', 'JJS'] # ex big bigger biggest
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS'] # ex desk desks Harrison Americans
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] # ex run ran running ran running running

    # Add new feature columns using the particle of speech tags to count the adjectives, nouns, and verbs
    df['number_of_adjectives'] = df['processed'].apply(lambda x: pos_count(x, adjective_tags))
    df['number_of_nouns'] = df['processed'].apply(lambda x: pos_count(x, noun_tags))
    df['number_of_verbs'] = df['processed'].apply(lambda x: pos_count(x, verb_tags))

    return(df)

df = processing(df)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0_level_0,text,author,processed,length,words,words_not_stopword,avg_word_length,commas,number_of_adjectives,number_of_nouns,number_of_verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
id26305,"This process, however, afforded me no means of...",EAP,this process however afforded me no means of a...,224,41,21,6.380952,4,2,12,6
id17569,It never once occurred to me that the fumbling...,HPL,it never once occurred to me that the fumbling...,70,14,6,6.166667,0,1,2,2
id11008,"In his left hand was a gold snuff box, from wh...",EAP,in his left hand was a gold snuff box from whi...,195,36,19,5.947368,4,5,10,4
id27763,How lovely is spring As we looked from Windsor...,MWS,how lovely is spring as we looked from windsor...,202,34,21,6.47619,3,6,10,5
id12958,"Finding nothing else, not even gold, the Super...",HPL,finding nothing else not even gold the superin...,170,27,16,7.1875,2,1,6,6


In [5]:
from sklearn.model_selection import train_test_split

features= [c for c in df.columns.values if c  not in ['id','text','author']]
numeric_features= [c for c in df.columns.values if c  not in ['id','text','author','processed']]
target = 'author'

X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.33, random_state=42)
X_train.head()

Unnamed: 0_level_0,processed,length,words,words_not_stopword,avg_word_length,commas,number_of_adjectives,number_of_nouns,number_of_verbs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
id19417,this panorama is indeed glorious and i should ...,91,18,6,6.666667,1,1,4,2
id09522,there was a simple natural earnestness about h...,240,44,18,6.277778,4,7,8,7
id22732,who are you pray that i duc de lomelette princ...,387,74,38,5.552632,9,3,18,10
id10351,he had gone in the carriage to the nearest tow...,118,24,11,5.363636,0,1,8,3
id24580,there is no method in their proceedings beyond...,71,13,5,7.0,1,0,4,1


In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

text = Pipeline([
                ('selector', TextSelector(key='processed')),
                ('tfidf', TfidfVectorizer(stop_words='english'))
            ])

text.fit_transform(X_train)

<13117x21516 sparse matrix of type '<class 'numpy.float64'>'
	with 148061 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.preprocessing import StandardScaler

length =  Pipeline([
                ('selector', NumberSelector(key='length')),
                ('standard', StandardScaler())
            ])

length.fit_transform(X_train)

array([[-0.50769254],
       [ 0.88000324],
       [ 2.24907223],
       ...,
       [-0.46112557],
       [-0.14447015],
       [-0.39593181]])

In [9]:
words =  Pipeline([
                ('selector', NumberSelector(key='words')),
                ('standard', StandardScaler())
            ])
words_not_stopword =  Pipeline([
                ('selector', NumberSelector(key='words_not_stopword')),
                ('standard', StandardScaler())
            ])
avg_word_length =  Pipeline([
                ('selector', NumberSelector(key='avg_word_length')),
                ('standard', StandardScaler())
            ])
commas =  Pipeline([
                ('selector', NumberSelector(key='commas')),
                ('standard', StandardScaler()),
            ])
number_of_adjectives =  Pipeline([
                ('selector', NumberSelector(key='number_of_adjectives')),
                ('standard', StandardScaler()),
            ])
number_of_nouns =  Pipeline([
                ('selector', NumberSelector(key='number_of_nouns')),
                ('standard', StandardScaler()),
            ])
number_of_verbs =  Pipeline([
                ('selector', NumberSelector(key='number_of_verbs')),
                ('standard', StandardScaler()),
            ])

In [10]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text),
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                      ])

feats2 = FeatureUnion([('text', text),
                      ('length', length),
                      ('words', words),
                      ('words_not_stopword', words_not_stopword),
                      ('avg_word_length', avg_word_length),
                      ('commas', commas),
                      ('number_of_adjectives', number_of_adjectives),
                      ('number_of_nouns', number_of_nouns),
                      ('number_of_verbs', number_of_verbs),
                      ])


feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

feature_processing2 = Pipeline([('feats2', feats2)])
feature_processing2.fit_transform(X_train)

<13117x21524 sparse matrix of type '<class 'numpy.float64'>'
	with 252997 stored elements in Compressed Sparse Row format>

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Using Logistic Regression instead of Random Forest, with same random state
# Applying l2 regularization to prevent overfitting
pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression(random_state=42, max_iter=3000, penalty='l2')),
])

pipeline2 = Pipeline([
    ('features2', feats2),
    ('classifier2', LogisticRegression(random_state=42, max_iter=3000, penalty='l2')),
])

pipeline.fit(X_train, y_train)

pipeline2.fit(X_train, y_train)

# Previous results before POS features
preds = pipeline.predict(X_test)
np.mean(preds == y_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         EAP       0.74      0.85      0.79      2587
         HPL       0.81      0.74      0.77      1852
         MWS       0.83      0.73      0.78      2023

    accuracy                           0.78      6462
   macro avg       0.79      0.77      0.78      6462
weighted avg       0.79      0.78      0.78      6462



In [12]:
# New results after POS features
preds2 = pipeline2.predict(X_test)
np.mean(preds2 == y_test)
print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

         EAP       0.74      0.84      0.79      2587
         HPL       0.81      0.74      0.78      1852
         MWS       0.81      0.73      0.77      2023

    accuracy                           0.78      6462
   macro avg       0.79      0.77      0.78      6462
weighted avg       0.78      0.78      0.78      6462



By adding the new particle of speech features to the logistic regression model, we can see that the precision, f1 score and recall have not changed much and slightly decreased for recall indicating a similar performance even with the newly added features. This could be the result of the increasing the completity of the model but not increasing the amount of training data or it could indicate that particles of speech are not important features for this classification.




In [13]:
from sklearn.model_selection import GridSearchCV

hyperparameters = { 'features2__text__tfidf__max_df': [0.9],
                    'features2__text__tfidf__ngram_range': [(1,2)],
                   'classifier2__C': [10]
                    }

cv_params = [2, 10, 20]

for cv in cv_params:
  clf = GridSearchCV(pipeline2, hyperparameters, cv=cv)
  # Fit and tune model
  clf.fit(X_train, y_train)
  print(f"Results of cross-validation with {cv} folds...")
  print(clf.best_score_, clf.best_params_)

Results of cross-validation with 2 folds...
0.7689263141361534 {'classifier2__C': 10, 'features2__text__tfidf__max_df': 0.9, 'features2__text__tfidf__ngram_range': (1, 2)}
Results of cross-validation with 10 folds...
0.8039961465833194 {'classifier2__C': 10, 'features2__text__tfidf__max_df': 0.9, 'features2__text__tfidf__ngram_range': (1, 2)}
Results of cross-validation with 20 folds...
0.8066617017315212 {'classifier2__C': 10, 'features2__text__tfidf__max_df': 0.9, 'features2__text__tfidf__ngram_range': (1, 2)}


In [18]:
classifier = pipeline2.named_steps['classifier2']
text_features = text.named_steps['tfidf'].get_feature_names_out()
number_features = ['length', 'words', 'words_not_stopword', 'avg_word_length', 'commas', 'number_of_adjectives', 'number_of_nouns', 'number_of_verbs']

feature_names = np.concatenate([text_features, number_features]).tolist()

# Access the coefficients of the model to determine feature importance
coefs = classifier.coef_
for author, weights in enumerate(coefs):
    # Sort features by their weights
    sorted_features = sorted(zip(feature_names, weights), key=lambda x: x[1], reverse=True)
    print(f"For author {author}, 10 most important features are: {sorted_features[:10]}")
    print(f"For author {author}, 10 least important features are: {sorted_features[-10]}")
    print("Weights for 'number_of_adjectives':", weights[feature_names.index('number_of_adjectives')])
    print("Weights for 'number_of_nouns':", weights[feature_names.index('number_of_nouns')])
    print("Weights for 'number_of_verbs':", weights[feature_names.index('number_of_verbs')])

For author 0, 10 most important features are: [('mr', 2.2070094585282383), ('madame', 2.1816709753846957), ('gentleman', 2.013190769453038), ('balloon', 1.8277815070923709), ('minutes', 1.8250997959150872), ('lady', 1.812820260592429), ('altogether', 1.7459538971408315), ('dupin', 1.743493286391173), ('matter', 1.7080606416540658), ('character', 1.630414026191853)]
For author 0, 10 least important features are: ('idris', -1.4816601217918326)
Weights for 'number_of_adjectives': -0.1366200198995124
Weights for 'number_of_nouns': -0.33189913786442116
Weights for 'number_of_verbs': -0.59923333365271
For author 1, 10 most important features are: [('west', 2.547314423704751), ('street', 2.3648550056767546), ('later', 2.341573650918013), ('gilman', 2.2706586066021557), ('innsmouth', 2.015739521163835), ('men', 1.9950113236870695), ('despite', 1.9409899611896941), ('ancient', 1.8742313921084737), ('outside', 1.8588647803148792), ('jermyn', 1.8395251614421153)]
For author 1, 10 least important 

In [19]:
#refitting on entire training data using best settings
clf.refit

preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

np.mean(preds == y_test)

0.797121634168988

In [23]:
# Use the trained best estimator to predict on test set
preds2 = clf.best_estimator_.predict(X_test)

# Print the accuracy on the test data
accuracy2 = np.mean(preds2 == y_test)
print("Accuracy on test data is ", accuracy2)

# Find indices where predictions are wrong
wrong_predictions = np.where(preds2 != y_test)[0]

# Print 10 wrong predictions with details
print("10 wrong predictions: ")
for i in wrong_predictions[:10]:
    print("Features:", X_test.iloc[i])
    print("Predicted author:", preds2[i])
    print("Actual author:", y_test.iloc[i])

print(classification_report(y_test, preds2))

Accuracy on test data is  0.797121634168988
10 wrong predictions: 
Features: processed               he had seen so many customs and witnessed so g...
length                                                                399
words                                                                  69
words_not_stopword                                                     33
avg_word_length                                                  6.909091
commas                                                                  1
number_of_adjectives                                                    8
number_of_nouns                                                        17
number_of_verbs                                                        11
Name: id16303, dtype: object
Predicted author: HPL
Actual author: MWS
Features: processed               she listened to me as she had done to the narr...
length                                                                279
words                        