<a href="https://colab.research.google.com/github/royn5618/Talks_Resources/blob/main/DSCF_NLP_Workshop_Day_2_Workbook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import re

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import rcParams
plt.rcParams['figure.figsize'] = [10,10]
import seaborn as sns
sns.set_theme(style="darkgrid")

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
# The stopwords corpus must be downloaded before it is read on the next line.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus module to the
# English stopword collection itself; preprocess_data relies on that rebinding.
stopwords = nltk.corpus.stopwords.words('english')
# Remaining NLTK resources, presumably needed by word_tokenize, the WordNet
# lemmatizer and nltk.pos_tag used during preprocessing.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Load Data

In [None]:
# Both CSVs are resolved relative to the current working directory.
TRAIN_CSV_PATH = 'train_5k.csv'
TEST_CSV_PATH = 'test.csv'

train_df_ = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [None]:
# Shape (rows, columns) of the full training frame as loaded
train_df_.shape

(5000, 6)

In [None]:
# Work on an independent copy of the first 100 rows; the full frame stays untouched.
train_df = train_df_.iloc[:100].copy()
train_df.shape

(100, 6)

In [None]:
# Check the first few rows
# .head() with no argument shows the first 5 rows

train_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,author,text,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [None]:
# Check the 'label' distribution
# value_counts() tallies how many rows carry each label value

train_df['label'].value_counts()

0    55
1    45
Name: label, dtype: int64

In [None]:
# Shape (rows, columns) of the test frame
test_df.shape

(5200, 4)

In [None]:
# View Data Sample
# Use df.sample(n=X)
# random_state pins the draw, so Restart & Run All shows the same rows every time.

train_df.sample(n=5, random_state=42)

Unnamed: 0.1,Unnamed: 0,id,title,author,text,label
66,66,66,Jury finds all Oregon standoff defendants not ...,Admin,Oregon Live – by Maxine Bernstein \nA federal ...,1
80,80,80,"Louisiana, Simone Biles, U.S. Presidential Rac...",Andrea Kannapell and Sandra Stevenson,(Want to get this briefing by email? Here’s th...,0
17,17,17,Anonymous Donor Pays $2.5 Million To Release E...,Starkman,A Caddo Nation tribal leader has just been fre...,1
56,56,56,Ep. 544 FADE to BLACK Jimmy Church w/ Laura Ei...,Madeline,Click Here To Learn More About Alexandra's Per...,1
69,69,69,Bernie Sanders Says What The Media Won’t: Trum...,Jason Easley,"— Bernie Sanders (@BernieSanders) October 27, ...",1


In [None]:
# Investigate Null Values
# Use .info(): compare each column's non-null count with the total number of rows

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  100 non-null    int64 
 1   id          100 non-null    int64 
 2   title       99 non-null     object
 3   author      89 non-null     object
 4   text        100 non-null    object
 5   label       100 non-null    int64 
dtypes: int64(3), object(3)
memory usage: 4.8+ KB


In [None]:
# Drop Null Values
# Use dropna
# how='any' removes a row if ANY column is missing, so rows lacking only a
# title or an author are dropped too. Rebinding the result (instead of
# inplace=True) keeps the step an explicit, chainable expression.

train_df = train_df.dropna(axis=0, how='any')

# Data Preprocessing

Here's a list of things you could do:

1. Tokenize
2. Remove Stopwords
3. Remove punctuation and single characters
4. Lemmatize
5. Tag parts of speech (POS)
  
We recommend choosing one sample text and developing the logic on it first. Then take a sample of about 5 texts from the actual dataset and test for errors and performance.

In [None]:
# Peek at the first article body to have a concrete sample while building the preprocessing
train_df['text'].head(1)

0    House Dem Aide: We Didn’t Even See Comey’s Let...
Name: text, dtype: object

In [17]:
# Single WordNet lemmatizer instance, reused by preprocess_data for every token
lm = WordNetLemmatizer()

def preprocess_data(input_text, min_length=100):
    """
    Clean one raw document into a space-separated string of lemmas.

    Example input: Financial Markets admin \\nThe good news ...

    Steps:
    1. Skip inputs that are not strings, or that are not longer than
       `min_length` characters (too little text to train or predict on).
    2. Lower-case, tokenize and POS-tag the text.
    3. Drop stopwords and single-character tokens.
    4. Keep only adjectives, adverbs, nouns and verbs. Assuming NLTK's default
       Penn Treebank tags, this also discards punctuation and numbers, whose
       tags fall outside those four classes.
    5. Lemmatize each remaining token using its WordNet part of speech.

    :param input_text: String. Anything else (e.g. NaN from a missing cell) yields None.
    :param min_length: Int. The text must be longer than this many characters to be processed.
    :return: String of lemmas joined by single spaces, or None when the input is skipped.
    """
    # Non-string cells cannot be tokenized; skip them explicitly instead of
    # relying on len() raising TypeError and silently falling through.
    if not isinstance(input_text, str) or len(input_text) <= min_length:
        return None

    # First letter of the POS tag -> WordNet POS code. Penn Treebank adjective
    # tags start with "J" (JJ, JJR, JJS) while WordNet calls adjectives "a", so
    # comparing the raw first letter against "a" would never keep an adjective.
    penn_to_wordnet = {'j': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    stop_set = set(stopwords)  # constant-time membership checks in the loop

    output_tokens = []
    for each_token, pos_tag in nltk.pos_tag(word_tokenize(input_text.lower())):
        if len(each_token) <= 1 or each_token in stop_set:
            continue
        wordnet_pos = penn_to_wordnet.get(pos_tag[0].lower())
        if wordnet_pos is not None:
            output_tokens.append(lm.lemmatize(each_token, pos=wordnet_pos))
    return " ".join(output_tokens)

In [18]:
# Clean every article body; texts that preprocess_data skips come back as None.
train_df['text_cleaned'] = train_df['text'].apply(preprocess_data)

# ML Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Split your data

Link - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# use train_test_split

# Count Vectorizer

CountVectorizer - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

Read more - https://www.geeksforgeeks.org/using-countvectorizer-to-extracting-features-from-text/

In [None]:
# Initialize CountVectorizer

In [None]:
# Fit CountVectorizer on train data

In [None]:
# Transform CountVectorizer on train data

In [None]:
# Transform CountVectorizer on test data

# Fit a Classifier

**Scikit-Learn Docs:**

Logistic Regression - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Available Metrics - https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [None]:
# Try LogisticRegression first
# Initialize LogisticRegression


In [None]:
# Fit on train and target

In [None]:
# Predict on test data

In [None]:
# Compare results
# Try Accuracy / ClassificationReport

# Apply TF-IDF after Count Vectorizer

TfidfTransformer - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

Read more - https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/

In [None]:
# Initialize, fit and transform - CV and TFIDF
# Fit and Predict