# Q2.1. Data Preprocessing

### 1. Reading the csv format dataset

In [1]:
import os
# current directory
curr_dir = os.getcwd()

# dataset path

path = curr_dir+'\learn-ai-bbc'
print(path)   

C:\Users\Rohit Kesarwani\python_files\IR_Assignment_2\learn-ai-bbc


In [2]:
import pandas as pd
df_train= pd.read_csv(path+"\BBC News Train.csv")
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [3]:
df_train.shape

(1490, 3)

In [4]:
df_test=pd.read_csv(path+"\BBC News Test.csv")
df_test.head()

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [5]:
df_test.shape

(735, 2)

### 2. Removing unnecessary columns

In [6]:
# Remove the 'ArticleId' column
data = df_train.drop(columns=['ArticleId'])

### 3 & 4. Text cleaning and performing stemming and lemmatization

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download the stopwords and punkt tokenizer
nltk.download('stopwords')
nltk.download('punkt')

# Create a list of stopwords and punctuation marks
stopwords_list = stopwords.words('english') + list(string.punctuation)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# function to clean and tokenize the text
def clean_tokenize_text(text):
    # Convert the text to lowercase
    text = text.lower()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove the stopwords and punctuation marks
    words = [word for word in words if word not in stopwords_list]

    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

# Apply the clean_tokenize_text function to the 'Text' column
data['Text'] = data['Text'].apply(clean_tokenize_text)

[nltk_data] Downloading package stopwords to C:\Users\Rohit
[nltk_data]     Kesarwani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Rohit
[nltk_data]     Kesarwani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 5. TFICF-Weighing Scheme

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the list of words back to a string for the TfidfVectorizer
data['Text'] = data['Text'].apply(' '.join)

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the vectorizer on the 'Text' column
tfidf_matrix = vectorizer.fit_transform(data['Text'])

# Convert the sparse matrix to a dense matrix
tfidf_matrix = tfidf_matrix.toarray()

# Create a DataFrame from the dense matrix
tfidf_df = pd.DataFrame(tfidf_matrix, columns=vectorizer.get_feature_names())

# Add the 'Category' column to the DataFrame
tfidf_df['Category'] = data['Category']

# Group the DataFrame by category and calculate the mean of each feature
tficf_df = tfidf_df.groupby('Category').mean()



# Q2.2. Dataset Split

### Split the dataset into training and testing sets into 70:30

In [9]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_df.drop('Category', axis=1), tfidf_df['Category'], test_size=0.3, random_state=42)

# Q2.3. Training Naive-Bayes with TF-ICF

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Initialize the Naive Bayes classifier
clf = MultinomialNB()

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Predict the categories of the testing data
y_pred = clf.predict(X_test)
y_pred

array(['business', 'politics', 'politics', 'politics', 'tech', 'business',
       'tech', 'entertainment', 'business', 'tech', 'politics',
       'entertainment', 'business', 'sport', 'tech', 'politics',
       'business', 'sport', 'politics', 'business', 'tech',
       'entertainment', 'sport', 'business', 'sport', 'tech', 'business',
       'sport', 'tech', 'business', 'sport', 'business', 'business',
       'politics', 'sport', 'tech', 'politics', 'business',
       'entertainment', 'tech', 'sport', 'business', 'politics',
       'business', 'sport', 'business', 'politics', 'tech', 'politics',
       'business', 'sport', 'sport', 'business', 'sport', 'politics',
       'entertainment', 'entertainment', 'politics', 'business',
       'entertainment', 'tech', 'business', 'politics', 'tech', 'tech',
       'tech', 'business', 'tech', 'politics', 'politics', 'politics',
       'sport', 'sport', 'sport', 'entertainment', 'entertainment',
       'business', 'politics', 'business', 'busine

In [11]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print('Accuracy:', accuracy)

# Print the confusion matrix
confusion_matrix1 = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', confusion_matrix)


precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Print the evaluation metrics
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 score: {f1:.4f}')

Accuracy: 0.970917225950783
Confusion matrix:
 <function confusion_matrix at 0x000001CF89169940>
Precision: 0.9718
Recall: 0.9691
F1 score: 0.9702


### Split the dataset into training and testing sets into 80:20

In [12]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(tfidf_df.drop('Category', axis=1), tfidf_df['Category'], test_size=0.2, random_state=42)

In [13]:
# Initialize the Naive Bayes classifier
clf2 = MultinomialNB()

# Fit the classifier on the training data
clf2.fit(X_train2, y_train2)

# Predict the categories of the testing data
y_pred2 = clf2.predict(X_test2)
y_pred2

array(['business', 'politics', 'politics', 'politics', 'sport',
       'business', 'tech', 'entertainment', 'business', 'tech',
       'politics', 'entertainment', 'business', 'sport', 'tech',
       'politics', 'business', 'sport', 'politics', 'business', 'tech',
       'entertainment', 'sport', 'business', 'sport', 'tech', 'business',
       'sport', 'tech', 'business', 'sport', 'business', 'business',
       'politics', 'sport', 'tech', 'politics', 'business',
       'entertainment', 'tech', 'sport', 'business', 'politics',
       'business', 'sport', 'business', 'politics', 'tech', 'politics',
       'business', 'sport', 'sport', 'business', 'sport', 'politics',
       'entertainment', 'entertainment', 'politics', 'business',
       'entertainment', 'tech', 'business', 'politics', 'tech', 'tech',
       'tech', 'business', 'tech', 'politics', 'politics', 'politics',
       'sport', 'sport', 'sport', 'entertainment', 'entertainment',
       'business', 'politics', 'business', 'busin

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy2 = accuracy_score(y_test2, y_pred2)

# Print the accuracy score
print('Accuracy:', accuracy2)

# Print the confusion matrix
confusion_matrix2 = confusion_matrix(y_test2, y_pred2)
print('Confusion matrix:\n', confusion_matrix2)


precision2 = precision_score(y_test2, y_pred2, average='macro')
recall2 = recall_score(y_test2, y_pred2, average='macro')
f1_second = f1_score(y_test2, y_pred2, average='macro')

# Print the evaluation metrics
print(f'Precision: {precision2:.4f}')
print(f'Recall: {recall2:.4f}')
print(f'F1 score: {f1_second:.4f}')

Accuracy: 0.959731543624161
Confusion matrix:
 [[73  0  1  0  1]
 [ 1 44  1  0  0]
 [ 2  0 53  1  0]
 [ 0  0  0 63  0]
 [ 1  0  3  1 53]]
Precision: 0.9625
Recall: 0.9580
F1 score: 0.9598


### Split the dataset into training and testing sets into 60:40


In [15]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(tfidf_df.drop('Category', axis=1), tfidf_df['Category'], test_size=0.4, random_state=42)

In [16]:
# Initialize the Naive Bayes classifier
clf3 = MultinomialNB()

# Fit the classifier on the training data
clf3.fit(X_train3, y_train3)

# Predict the categories of the testing data
y_pred3 = clf3.predict(X_test3)
y_pred3

array(['business', 'politics', 'politics', 'politics', 'sport',
       'business', 'tech', 'entertainment', 'business', 'tech',
       'politics', 'entertainment', 'business', 'sport', 'tech',
       'politics', 'business', 'sport', 'politics', 'business', 'tech',
       'entertainment', 'sport', 'business', 'sport', 'tech', 'business',
       'sport', 'tech', 'business', 'sport', 'business', 'business',
       'politics', 'sport', 'tech', 'politics', 'business',
       'entertainment', 'business', 'sport', 'business', 'politics',
       'business', 'sport', 'business', 'politics', 'tech', 'politics',
       'business', 'sport', 'sport', 'business', 'sport', 'politics',
       'entertainment', 'entertainment', 'politics', 'business',
       'entertainment', 'tech', 'business', 'politics', 'tech', 'tech',
       'tech', 'business', 'tech', 'politics', 'politics', 'politics',
       'sport', 'sport', 'sport', 'entertainment', 'entertainment',
       'business', 'politics', 'business', 'b

In [17]:
accuracy3 = accuracy_score(y_test3, y_pred3)

# Print the accuracy score
print('Accuracy:', accuracy3)

# Print the confusion matrix
confusion_matrix3 = confusion_matrix(y_test3, y_pred3)
print('Confusion matrix:\n', confusion_matrix3)


precision3 = precision_score(y_test3, y_pred3, average='macro')
recall3 = recall_score(y_test3, y_pred3, average='macro')
f1_third = f1_score(y_test3, y_pred3, average='macro')

# Print the evaluation metrics
print(f'Precision: {precision3:.4f}')
print(f'Recall: {recall3:.4f}')
print(f'F1 score: {f1_third:.4f}')

Accuracy: 0.9580536912751678
Confusion matrix:
 [[134   0   1   0   2]
 [  1 102   2   4   0]
 [  2   0 104   3   0]
 [  0   0   0 129   0]
 [  2   0   5   3 102]]
Precision: 0.9603
Recall: 0.9557
F1 score: 0.9572


### Split the dataset into training and testing sets into 50:50

In [18]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(tfidf_df.drop('Category', axis=1), tfidf_df['Category'], test_size=0.5, random_state=42)

In [19]:
# Initialize the Naive Bayes classifier
clf4 = MultinomialNB()

# Fit the classifier on the training data
clf4.fit(X_train4, y_train4)

# Predict the categories of the testing data
y_pred4 = clf4.predict(X_test4)
y_pred4

array(['business', 'politics', 'politics', 'politics', 'sport',
       'business', 'tech', 'entertainment', 'business', 'tech',
       'politics', 'entertainment', 'business', 'sport', 'tech',
       'politics', 'business', 'sport', 'politics', 'business', 'tech',
       'entertainment', 'sport', 'business', 'sport', 'tech', 'business',
       'sport', 'tech', 'business', 'sport', 'business', 'business',
       'politics', 'sport', 'tech', 'politics', 'business',
       'entertainment', 'business', 'sport', 'business', 'politics',
       'business', 'sport', 'business', 'politics', 'tech', 'politics',
       'business', 'sport', 'sport', 'business', 'sport', 'politics',
       'entertainment', 'entertainment', 'politics', 'business',
       'entertainment', 'tech', 'business', 'entertainment', 'tech',
       'tech', 'tech', 'business', 'tech', 'politics', 'politics',
       'politics', 'sport', 'sport', 'sport', 'entertainment',
       'entertainment', 'business', 'politics', 'business

In [20]:
accuracy4 = accuracy_score(y_test4, y_pred4)

# Print the accuracy score
print('Accuracy:', accuracy4)

# Print the confusion matrix
confusion_matrix4 = confusion_matrix(y_test4, y_pred4)
print('Confusion matrix:\n', confusion_matrix4)


precision4 = precision_score(y_test4, y_pred4, average='macro')
recall4 = recall_score(y_test4, y_pred4, average='macro')
f1_fourth = f1_score(y_test4, y_pred4, average='macro')

# Print the evaluation metrics
print(f'Precision: {precision4:.4f}')
print(f'Recall: {recall4:.4f}')
print(f'F1 score: {f1_fourth:.4f}')

Accuracy: 0.9651006711409396
Confusion matrix:
 [[168   0   1   0   2]
 [  1 124   0   5   0]
 [  4   0 134   4   0]
 [  0   0   0 164   0]
 [  3   0   3   3 129]]
Precision: 0.9684
Recall: 0.9629
F1 score: 0.9651


In [21]:
!pip install PrettyTable





In [22]:
# Comparision among different splits
from prettytable import PrettyTable

myTable = PrettyTable()

myTable.title='Comparison Among different Splits'
myTable.add_column('Splits', ['70:30', '80:20', '60:40', '50:50'])
myTable.add_column('Accuracy', [accuracy, accuracy2, accuracy3, accuracy4])
myTable.add_column('Precision', [precision, precision2, precision3, precision4])
myTable.add_column('Recall', [recall, recall2, recall3, recall4])
myTable.add_column('F1-score', [f1, f1_second, f1_third, f1_fourth])

print(myTable)

+--------------------------------------------------------------------------------------------+
|                             Comparison Among different Splits                              |
+--------+--------------------+--------------------+--------------------+--------------------+
| Splits |      Accuracy      |     Precision      |       Recall       |      F1-score      |
+--------+--------------------+--------------------+--------------------+--------------------+
| 70:30  | 0.970917225950783  | 0.9717560217560217 | 0.9691219398977251 | 0.970230855958358  |
| 80:20  | 0.959731543624161  | 0.9625114604424949 | 0.9580153494681231 | 0.9597864452798662 |
| 60:40  | 0.9580536912751678 | 0.9602853980551822 | 0.9557449464752084 | 0.9572293239000491 |
| 50:50  | 0.9651006711409396 | 0.968422190708948  | 0.9629493749447338 | 0.965126842151743  |
+--------+--------------------+--------------------+--------------------+--------------------+


In [23]:
# Confusion matrices of all splits
print('Confusion matrices:\n')
print('70:30\n ', confusion_matrix1)
print('80:20\n ',confusion_matrix2)
print('60:40\n ', confusion_matrix3)
print('50:50\n ', confusion_matrix4)

Confusion matrices:

70:30
  [[105   0   1   0   2]
 [  1  75   1   2   0]
 [  2   0  83   1   0]
 [  0   0   0 101   0]
 [  0   0   3   0  70]]
80:20
  [[73  0  1  0  1]
 [ 1 44  1  0  0]
 [ 2  0 53  1  0]
 [ 0  0  0 63  0]
 [ 1  0  3  1 53]]
60:40
  [[134   0   1   0   2]
 [  1 102   2   4   0]
 [  2   0 104   3   0]
 [  0   0   0 129   0]
 [  2   0   5   3 102]]
50:50
  [[168   0   1   0   2]
 [  1 124   0   5   0]
 [  4   0 134   4   0]
 [  0   0   0 164   0]
 [  3   0   3   3 129]]


In [24]:
# Calculate the frequency of each category in the training set
category_freq = y_train.value_counts(normalize=True)

# Print the frequency of each category
print('Category frequency:\n', category_freq)

Category frequency:
 sport            0.234899
business         0.218600
entertainment    0.186002
tech             0.180249
politics         0.180249
Name: Category, dtype: float64


In [25]:
data['Text']

0       worldcom ex-boss launch defence lawyer defendi...
1       german business confidence slide german busine...
2       bbc poll indicates economic gloom citizen majo...
3       lifestyle governs mobile choice faster better ...
4       enron boss 168m payout eighteen former enron d...
                              ...                        
1485    double eviction big brother model caprice holb...
1486    dj double act revamp chart show dj duo jk joel...
1487    weak dollar hit reuters revenue medium group r...
1488    apple ipod family expands market apple expande...
1489    santy worm make unwelcome visit thousand websi...
Name: Text, Length: 1490, dtype: object

In [26]:
y_train

701     entertainment
1142             tech
490              tech
10           politics
147          business
            ...      
1130         politics
1294         business
860          politics
1459    entertainment
1126            sport
Name: Category, Length: 1043, dtype: object

In [27]:
X_train

Unnamed: 0,00,000,0001,000bn,000m,000th,001,001and,001st,0051,...,zombie,zone,zonealarm,zoom,zooropa,zorro,zuluaga,zurich,zutons,zvonareva
701,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1142,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1294,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.031895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1459,0.0,0.036481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer()

# Calculate the TF-IDF values for each feature in the training set
X_train_tfidf = tfidf.fit_transform(data['Text'])

# Loop over the unique categories in the training set
for category in y_train.unique():
    # Get the indices of the training samples that belong to this category
    category_indices = y_train[y_train == category].index
    # Calculate the average TF-IDF value for each feature in documents belonging to this category
    avg_tfidf = X_train_tfidf[category_indices].mean(axis=0)
    # Convert the average TF-IDF value to a 1D array
    avg_tfidf = np.squeeze(np.asarray(avg_tfidf))
    # Print the feature names and their corresponding average TF-IDF values
    feature_names = tfidf.get_feature_names()
    for feature_idx in np.argsort(avg_tfidf)[-10:]:
        feature_name = feature_names[feature_idx]
        feature_tfidf = avg_tfidf[feature_idx]
        print(f'{category}: {feature_name} - {feature_tfidf:.4f}')

entertainment: said - 0.0279
entertainment: oscar - 0.0284
entertainment: music - 0.0294
entertainment: band - 0.0312
entertainment: actor - 0.0317
entertainment: star - 0.0327
entertainment: show - 0.0343
entertainment: best - 0.0455
entertainment: award - 0.0459
entertainment: film - 0.0758
tech: computer - 0.0310
tech: software - 0.0318
tech: user - 0.0323
tech: service - 0.0352
tech: technology - 0.0353
tech: game - 0.0354
tech: said - 0.0354
tech: people - 0.0403
tech: phone - 0.0471
tech: mobile - 0.0568
politics: minister - 0.0390
politics: tory - 0.0399
politics: would - 0.0409
politics: government - 0.0424
politics: party - 0.0458
politics: blair - 0.0518
politics: labour - 0.0519
politics: said - 0.0601
politics: election - 0.0603
politics: mr - 0.0796
business: mr - 0.0249
business: share - 0.0260
business: market - 0.0263
business: year - 0.0267
business: economy - 0.0273
business: growth - 0.0273
business: bank - 0.0277
business: company - 0.0311
business: firm - 0.0331
bu



In [29]:
data2 = df_train.drop(columns=['ArticleId'])

In [30]:
# Create a list of stopwords and punctuation marks
stopwords_list = stopwords.words('english') + list(string.punctuation)

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# function to clean and tokenize the text
def clean_tokenize_text(text):
    # Convert the text to lowercase
    text = text.lower()

    # Tokenize the text
    words = word_tokenize(text)

    # Remove the stopwords and punctuation marks
    words = [word for word in words if word not in stopwords_list]

    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]

    return words

# Apply the clean_tokenize_text function to the 'Text' column
data2['Text'] = data2['Text'].apply(clean_tokenize_text)

### Using Ngram-feature

In [31]:
ngram_range = (1,2)

# Convert the list of words back to a string for the TfidfVectorizer
data2['Text'] = data2['Text'].apply(' '.join)

# Initialize the TfidfVectorizer
vectorizer2 = TfidfVectorizer(ngram_range = ngram_range)

# Fit and transform the vectorizer on the 'Text' column
tfidf_matrix2 = vectorizer2.fit_transform(data2['Text'])

# Convert the sparse matrix to a dense matrix
tfidf_matrix2 = tfidf_matrix2.toarray()

# Create a DataFrame from the dense matrix
tfidf_df2 = pd.DataFrame(tfidf_matrix2, columns=vectorizer2.get_feature_names())

# Add the 'Category' column to the DataFrame
tfidf_df2['Category'] = data2['Category']

# Group the DataFrame by category and calculate the mean of each feature
tficf_df2 = tfidf_df2.groupby('Category').mean()



In [32]:
X_train5, X_test5, y_train5, y_test5 = train_test_split(tfidf_df2.drop('Category', axis=1), tfidf_df2['Category'], test_size=0.3, random_state=42)

In [33]:
# Initialize the Naive Bayes classifier
clf5 = MultinomialNB()

# Fit the classifier on the training data
clf5.fit(X_train5, y_train5)

# Predict the categories of the testing data
y_pred5 = clf5.predict(X_test5)

In [34]:
accuracy5 = accuracy_score(y_test5, y_pred5)

# Print the accuracy score
print('Accuracy:', accuracy5)

# Print the confusion matrix
confusion_matrix5 = confusion_matrix(y_test5, y_pred5)
print('Confusion matrix:\n', confusion_matrix5)


precision5 = precision_score(y_test5, y_pred5, average='macro')
recall5 = recall_score(y_test5, y_pred5, average='macro')
f1_fifth = f1_score(y_test5, y_pred5, average='macro')

# Print the evaluation metrics
print(f'Precision: {precision5:.4f}')
print(f'Recall: {recall5:.4f}')
print(f'F1 score: {f1_fifth:.4f}')

Accuracy: 0.9574944071588367
Confusion matrix:
 [[104   0   2   0   2]
 [  2  72   0   5   0]
 [  2   0  81   1   2]
 [  0   0   0 101   0]
 [  0   0   2   1  70]]
Precision: 0.9594
Recall: 0.9550
F1 score: 0.9566


In [35]:
# Comparision among different splits
myTable1 = PrettyTable()

myTable1.title='Comparison Among different Splits'
myTable1.add_column('Splits', ['70:30', '80:20', '60:40', '50:50', 'N-gram'])
myTable1.add_column('Accuracy', [accuracy, accuracy2, accuracy3, accuracy4, accuracy5])
myTable1.add_column('Precision', [precision, precision2, precision3, precision4, precision5])
myTable1.add_column('Recall', [recall, recall2, recall3, recall4, recall5])
myTable1.add_column('F1-score', [f1, f1_second, f1_third, f1_fourth, f1_fifth])

print(myTable1)

+--------------------------------------------------------------------------------------------+
|                             Comparison Among different Splits                              |
+--------+--------------------+--------------------+--------------------+--------------------+
| Splits |      Accuracy      |     Precision      |       Recall       |      F1-score      |
+--------+--------------------+--------------------+--------------------+--------------------+
| 70:30  | 0.970917225950783  | 0.9717560217560217 | 0.9691219398977251 | 0.970230855958358  |
| 80:20  | 0.959731543624161  | 0.9625114604424949 | 0.9580153494681231 | 0.9597864452798662 |
| 60:40  | 0.9580536912751678 | 0.9602853980551822 | 0.9557449464752084 | 0.9572293239000491 |
| 50:50  | 0.9651006711409396 | 0.968422190708948  | 0.9629493749447338 | 0.965126842151743  |
| N-gram | 0.9574944071588367 | 0.9594070541129364 | 0.9550239885463148 |  0.9565723795072   |
+--------+--------------------+-------------------

In [36]:
# Confusion matrices of all splits
print('Confusion matrices:\n')
print('70:30\n ', confusion_matrix1)
print('80:20\n ',confusion_matrix2)
print('60:40\n ', confusion_matrix3)
print('50:50\n ', confusion_matrix4)
print('N-gram\n ', confusion_matrix5)

Confusion matrices:

70:30
  [[105   0   1   0   2]
 [  1  75   1   2   0]
 [  2   0  83   1   0]
 [  0   0   0 101   0]
 [  0   0   3   0  70]]
80:20
  [[73  0  1  0  1]
 [ 1 44  1  0  0]
 [ 2  0 53  1  0]
 [ 0  0  0 63  0]
 [ 1  0  3  1 53]]
60:40
  [[134   0   1   0   2]
 [  1 102   2   4   0]
 [  2   0 104   3   0]
 [  0   0   0 129   0]
 [  2   0   5   3 102]]
50:50
  [[168   0   1   0   2]
 [  1 124   0   5   0]
 [  4   0 134   4   0]
 [  0   0   0 164   0]
 [  3   0   3   3 129]]
N-gram
  [[104   0   2   0   2]
 [  2  72   0   5   0]
 [  2   0  81   1   2]
 [  0   0   0 101   0]
 [  0   0   2   1  70]]
