In [29]:
# 1- Import Libraries:

# Data Manipulation:
# pandas: Provides data structures like DataFrames, which are useful for handling and processing structured data.
    
import pandas as pd
    
# Feature Extraction:
# CountVectorizer: Converts a collection of text documents to a matrix of token counts.
# TfidfVectorizer: Converts a collection of raw documents to a matrix of TF-IDF features, which reflect the importance of words in the documents.

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Model Training:
# MultinomialNB: Implements the Multinomial Naive Bayes algorithm, which is suitable for classification with discrete features (like word counts for text classification).

from sklearn.naive_bayes import MultinomialNB

# Model Evaluation:
# accuracy_score: Calculates the accuracy of the model by comparing the predicted labels with the true labels.

from sklearn.metrics import accuracy_score

# 2- Load Dataset

2a - Define the file path once

In [30]:
import os

# Location of the training CSV.
# The default is the original local path used when this notebook was written.
# Set the NLP_TRAINING_DATA environment variable to point at another copy of the
# file so the notebook can run on a different machine without editing this cell.
file_path = os.environ.get(
    "NLP_TRAINING_DATA",
    "D:/OneDrive - Royal HaskoningDHV/920791/Pri 3/ironhack/nlp-project/Project-2-NLP/dataset/training_data_lowercase.csv",
)

2b- Load the dataset

In [31]:
# First pass: read the file with pandas' default settings to see how it parses.
data = pd.read_csv(filepath_or_buffer=file_path)

2c- Install the chardet library to detect the encoding programmatically

In [32]:
# Install chardet via the IPython %pip magic; the import in the next step depends on it.
# NOTE(review): no version is pinned here — consider pinning one for reproducibility.
%pip install chardet

Note: you may need to restart the kernel to use updated packages.


2d- Detect the encoding

In [33]:
import chardet

# Grab the first 10,000 bytes of the file in binary mode as a sample for detection.
with open(file_path, 'rb') as file:
    raw_data = file.read(10000)

# The sample is in memory, so the file can already be closed while chardet guesses.
result = chardet.detect(raw_data)
encoding = result['encoding']
print(f"The detected encoding is: {encoding}")

The detected encoding is: UTF-8-SIG


2e- Load the dataset with the correct encoding to handle BOM

In [34]:
# Read the file again, this time decoding it as 'utf-8-sig' (UTF-8 with a byte-order mark).
data = pd.read_csv(
    file_path,
    encoding='utf-8-sig',
)

2f- Display the column names to verify they are correctly parsed

In [35]:
# Show the column labels pandas derived from the file, to check how the first row was interpreted.
print(data.columns)

Index(['0\tdonald trump sends out embarrassing new year‚s eve message; this is disturbing'], dtype='object')


2g- Display the first few rows of the dataset

In [36]:
# Preview the top five rows as plain text.
print(data.head(n=5))

  0\tdonald trump sends out embarrassing new year‚s eve message; this is disturbing
0  0\tdrunk bragging trump staffer started russia...                               
1  0\tsheriff david clarke becomes an internet jo...                               
2  0\ttrump is so obsessed he even has obama‚s na...                               
3  0\tpope francis just called out donald trump d...                               
4  0\tracist alabama cops brutalize black boy whi...                               


2h- Reload the dataset with the BOM-aware encoding and header=None, so the first row is kept as data instead of being used as the column names

In [49]:
# Reload without a header row: header=None keeps the first line as data and
# gives the single parsed column the integer label 0.
# NOTE(review): the rows look tab-delimited ('\t' shows up in the preview); passing
# sep='\t' would parse label and text directly, but the later split step relies on
# the single column 0 produced here — confirm before changing.
data = pd.read_csv(
    file_path,
    header=None,
    encoding='utf-8-sig',
)

2i- Display the first few rows of the dataset

In [50]:
# Preview the top five rows of the reloaded frame as plain text.
print(data.head(n=5))

                                                   0
0  0\tdonald trump sends out embarrassing new yea...
1  0\tdrunk bragging trump staffer started russia...
2  0\tsheriff david clarke becomes an internet jo...
3  0\ttrump is so obsessed he even has obama‚s na...
4  0\tpope francis just called out donald trump d...


2j- Split each row at the first tab ('\t'): the part before the tab (e.g. '0') becomes the label column and the rest becomes the text column

In [51]:
# Cut every raw row once, at its first tab: the piece before the tab is the label,
# everything after it is the text.
label_and_text = data[0].str.split('\t', n=1, expand=True)
data['label'] = label_and_text[0]
data['text'] = label_and_text[1]

Drop the original column

In [52]:
# The raw combined column (labelled 0) is no longer needed once it has been split.
data = data.drop(labels=[0], axis=1)

2k- Display the first few rows of the dataset after removing the first part

In [53]:
# Preview the top five rows now that the label and text columns exist.
print(data.head(n=5))

  label                                               text
0     0  donald trump sends out embarrassing new year‚s...
1     0  drunk bragging trump staffer started russian c...
2     0  sheriff david clarke becomes an internet joke ...
3     0  trump is so obsessed he even has obama‚s name ...
4     0  pope francis just called out donald trump duri...


# 3- Preprocess Data

In [54]:
# 3a- Count the missing entries in each column (a 0 means the column is complete)

missing_per_column = data.isna().sum()
print(missing_per_column)

label    0
text     0
dtype: int64


The output shows 0 for both the `label` and `text` columns: neither column has any missing values.

In [55]:
# Drop rows with a missing value in either the 'label' or the 'text' column.
# A raw row without a tab gets no 'text' value from the split step, and a missing
# text would make the vectorizers fail later, so both columns are checked here.

data = data.dropna(subset=['label', 'text'])

In [56]:
# 3b- Sanity-check the cleaned frame: null counts per column, a five-row preview,
# and the (rows, columns) shape, printed in that order.

for report in (data.isna().sum(), data.head(n=5), data.shape):
    print(report)

label    0
text     0
dtype: int64
  label                                               text
0     0  donald trump sends out embarrassing new year‚s...
1     0  drunk bragging trump staffer started russian c...
2     0  sheriff david clarke becomes an internet joke ...
3     0  trump is so obsessed he even has obama‚s name ...
4     0  pope francis just called out donald trump duri...
(34152, 2)


In [57]:
# 4- Split Data: hold out 20% of the rows as a test set for evaluating the model;
# the remaining 80% is used for training. random_state is fixed so the same split
# is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# 5- Feature Extraction: We use two methods to convert the text data into numerical features:

In [58]:
# 5a- Method 1: Count Vectorizer — represent each text as a vector of token counts.
# The vocabulary is learned from the training texts only; both splits are then
# encoded with that same fitted vectorizer.

count_vectorizer = CountVectorizer().fit(X_train)
X_train_count = count_vectorizer.transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

In [59]:
# 5b- Method 2: TF-IDF Vectorizer — represent each text as a vector of TF-IDF weights.
# The vocabulary and IDF statistics are learned from the training texts only; both
# splits are then encoded with that same fitted vectorizer.

tfidf_vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# 6- Train and Evaluate Model:

In [60]:
# 6a- Multinomial Naive Bayes on the Count Vectorizer features:
# fit on the training matrix, predict the held-out texts, then score the predictions.
model_count = MultinomialNB().fit(X_train_count, y_train)
y_pred_count = model_count.predict(X_test_count)
accuracy_count = accuracy_score(y_true=y_test, y_pred=y_pred_count)


In [61]:
# 6b- Multinomial Naive Bayes on the TF-IDF Vectorizer features:
# fit on the training matrix, predict the held-out texts, then score the predictions.
model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_true=y_test, y_pred=y_pred_tfidf)

In [62]:
# 6c- Pick the representation with the higher test accuracy.
# TF-IDF is chosen only when it is strictly better; a tie keeps the Count Vectorizer.
tfidf_wins = accuracy_tfidf > accuracy_count
best_representation = "TF-IDF Vectorizer" if tfidf_wins else "Count Vectorizer"
best_accuracy = accuracy_tfidf if tfidf_wins else accuracy_count

print(f"The best feature representation is {best_representation} with an accuracy of {best_accuracy}.")

The best feature representation is Count Vectorizer with an accuracy of 0.9448104230712926.
