In [1]:
import pandas as pd

The data was captured in the kaggle from: https://www.kaggle.com/datasets/athu1105/book-genre-prediction?resource=download

In [2]:
Data_frame = pd.read_csv('data.csv')
Data_frame

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...
4652,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ..."
4653,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...
4654,4654,Red Rising,fantasy,"""I live for the dream that my children will be..."
4655,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [3]:
Data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657 entries, 0 to 4656
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   index    4657 non-null   int64 
 1   title    4657 non-null   object
 2   genre    4657 non-null   object
 3   summary  4657 non-null   object
dtypes: int64(1), object(3)
memory usage: 145.7+ KB


Based on above cell we have 4657 rows, 4 columns representing index, title, genre and summary. It is mentioned that there are no null values but just in case we will also check it in the following cells

EDA and NLP Aug is for augmentation of imbalanced data

In [4]:
def finding_missing_data(data_frame, column):
    data_frame_null = data_frame[column].isnull()
    data_frame_nan = data_frame[column].isna()
    missing_null = data_frame_null.sum()
    missing_nan = data_frame_nan.sum()
    print('number of null values in %s is %i' % (column, missing_null))
    print('number of na values in %s is %i' % (column, missing_nan))


In [5]:
attrs = ['title', 'genre', 'summary']
for attr in attrs:
    finding_missing_data(Data_frame, attr)

number of null values in title is 0
number of na values in title is 0
number of null values in genre is 0
number of na values in genre is 0
number of null values in summary is 0
number of na values in summary is 0


In [6]:
genre_counts = Data_frame['genre'].value_counts()
genre_counts

genre
thriller      1023
fantasy        876
science        647
history        600
horror         600
crime          500
romance        111
psychology     100
sports         100
travel         100
Name: count, dtype: int64

Base on the above cell I have 10 genres, number of each instances is also provided and we can find that out data is imbalance.
        

In [7]:
Data_frame.nunique()

index      4657
title      4296
genre        10
summary    4542
dtype: int64

Based on the above cell we can found out that there are some repetitive titles and summary by comparing these 2 row unique  numbers with the index. therefore, we have 361 same titles and 115 same  summary. In case of titles, if the summary is diffrent I do not think that it would be a problem.

In [8]:
import pandas as pd

# Find duplicate titles
duplicate_titles_count = Data_frame['title'].value_counts()

# Filter out titles with counts greater than 1
non_unique_titles = duplicate_titles_count[duplicate_titles_count > 1]

# Convert Series to DataFrame
non_unique_titles_df = pd.DataFrame({'title': non_unique_titles.index, 'count': non_unique_titles.values})

non_unique_titles_df

Unnamed: 0,title,count
0,Bloodline,4
1,Nemesis,4
2,The Hunger Games,4
3,The Enemy,3
4,Stone Cold,3
...,...,...
314,Insurgent,2
315,Speaker for the Dead,2
316,Harry Potter and the Order of the Phoenix,2
317,1st to Die,2


In [9]:
import pandas as pd

# Initialize an empty list to store the data
title_index_pairs = []

# Iterate over non-unique titles
for title in non_unique_titles.index:
    # Filter DataFrame for the current title
    title_df = Data_frame[Data_frame['title'] == title]
    
    # Extract unique summaries for the current title
    unique_summaries = title_df['summary'].unique()
    
    # Check if there is only one unique summary
    if len(unique_summaries) == 1:
        # Get the indices of the instances with the same summary
        indices = title_df.index.tolist()
        
        # Append the title and its corresponding indices to the list
        title_index_pairs.append({'title': title, 'indices': indices})

# Create a new DataFrame from the list
titles_indices_df = pd.DataFrame(title_index_pairs)


titles_indices_df


Unnamed: 0,title,indices
0,The Radium Girls: The Dark Story of America's ...,"[3037, 3038, 3502]"
1,Divergent,"[3330, 4424]"
2,The Cabin at the End of the World,"[3137, 3854]"
3,"Guns, Germs, and Steel: The Fates of Human Soc...","[3001, 3429]"
4,Rose Madder,"[3142, 4091]"
...,...,...
79,The Da Vinci Code,"[3351, 3660]"
80,Gone Girl,"[3349, 3658]"
81,Kulti,"[3365, 3561]"
82,Insurgent,"[3367, 4560]"


In [10]:
# Update the 'indices' column to keep all elements except the first one in each list
titles_indices_df['indices'] = titles_indices_df['indices'].apply(lambda x: x[1:])

# Display the modified DataFrame
titles_indices_df


Unnamed: 0,title,indices
0,The Radium Girls: The Dark Story of America's ...,"[3038, 3502]"
1,Divergent,[4424]
2,The Cabin at the End of the World,[3854]
3,"Guns, Germs, and Steel: The Fates of Human Soc...",[3429]
4,Rose Madder,[4091]
...,...,...
79,The Da Vinci Code,[3660]
80,Gone Girl,[3658]
81,Kulti,[3561]
82,Insurgent,[4560]


In [11]:
nondup_df = Data_frame.copy()


In [12]:
# Iterate over each row in titles_indices_df
for _, row in titles_indices_df.iterrows():
    # Extract indices from the current row
    indices_to_drop = row['indices']
    
    # Drop rows from nondup_df based on the indices
    nondup_df = nondup_df.drop(indices_to_drop)

# Reset the index of the modified DataFrame
nondup_df = nondup_df.reset_index(drop=True)

nondup_df


Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...
4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ..."
4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...
4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be..."
4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [13]:
nondup_df.to_csv("nondup.csv")

In [14]:
nondup_df = pd.read_csv("nondup.csv")

In [15]:
nondup_df

Unnamed: 0.1,Unnamed: 0,index,title,genre,summary
0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...
1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ..."
2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...
3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...
4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...
...,...,...,...,...,...
4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ..."
4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...
4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be..."
4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ..."


In [16]:
genre_counts_1 = nondup_df['genre'].value_counts()
genre_counts_1

genre
thriller      998
fantasy       860
science       626
horror        600
history       599
crime         500
romance       111
psychology     99
travel         98
sports         81
Name: count, dtype: int64

After changes we can see the genres are like this, and thhe number of rows are 4572. The column represent index, title, genre and summary

In [17]:
# Convert all words in the 'summary' column to lowercase
#our summary typed are object not strings
nondup_df['summary'] = nondup_df['summary'].str.lower()

nondup_df


Unnamed: 0.1,Unnamed: 0,index,title,genre,summary
0,0,0,Drowned Wednesday,fantasy,drowned wednesday is the first trustee among ...
1,1,1,The Lost Hero,fantasy,"as the book opens, jason awakens on a school ..."
2,2,2,The Eyes of the Overworld,fantasy,cugel is easily persuaded by the merchant fia...
3,3,3,Magic's Promise,fantasy,the book opens with herald-mage vanyel return...
4,4,4,Taran Wanderer,fantasy,taran and gurgi have returned to caer dallben...
...,...,...,...,...,...
4567,4567,4652,Hounded,fantasy,"atticus o’sullivan, last of the druids, lives ..."
4568,4568,4653,Charlie and the Chocolate Factory,fantasy,charlie bucket's wonderful adventure begins wh...
4569,4569,4654,Red Rising,fantasy,"""i live for the dream that my children will be..."
4570,4570,4655,Frostbite,fantasy,"rose loves dimitri, dimitri might love tasha, ..."


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')

# Create a tokenizer that matches word characters
tokenizer_regex = RegexpTokenizer(r'\w+')

# Define a function to remove stopwords, punctuation, and lowercase the text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    # Lowercase the text
    text = text.lower()
    # Tokenize the text and remove punctuation
    tokens_without_punctuation = tokenizer_regex.tokenize(text)
    # Filter out stopwords
    filtered_tokens = [word for word in tokens_without_punctuation if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
import pandas as pd
nondup_df = pd.read_csv("nondup.csv")

In [20]:
nondup_df['modified'] = nondup_df['summary'].apply(preprocess_text)
nondup_df

Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified
0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...
1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...
2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...
3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...
4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...
...,...,...,...,...,...,...
4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...
4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...
4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...
4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...


In [21]:
lemma_df = nondup_df.copy()

In [22]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

nltk.download('punkt')
nltk.download('wordnet')

# lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to lemmatize text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Apply lemmatization to the "modified" column
lemma_df['modified_lemmatized'] = lemma_df['modified'].apply(lemmatize_text)

lemma_df


[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized
0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...
1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...
2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...
3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...
4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...
...,...,...,...,...,...,...,...
4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...
4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...
4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...
4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...


In [23]:
lemma_df.to_csv("lemma_df.csv")

In [24]:
lemma_df = pd.read_csv("lemma_df.csv")

In [25]:
tokenized_df = lemma_df.copy()

In [26]:
from nltk.tokenize import word_tokenize

In [27]:
tokenized_df['word_tokenized_summary'] = tokenized_df['modified_lemmatized'].apply(word_tokenize)
tokenized_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary
0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"[drowned, wednesday, first, trustee, among, mo..."
1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"[book, open, jason, awakens, school, bus, unab..."
2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"[cugel, easily, persuaded, merchant, fianosthe..."
3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"[book, open, herald, mage, vanyel, returning, ..."
4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"[taran, gurgi, returned, caer, dallben, follow..."
...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"[atticus, sullivan, last, druid, life, peacefu..."
4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"[charlie, bucket, wonderful, adventure, begin,..."
4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"[live, dream, child, born, free, say, like, la..."
4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"[rose, love, dimitri, dimitri, might, love, ta..."


In [28]:
tokenized_df.to_csv("token.csv")

In [29]:
tokenized_df = pd.read_csv("token.csv")

In [30]:
POS_df = tokenized_df.copy()

In [31]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Function to get POS tags for a sentence
def get_pos_tags(sentence):
    tokens = word_tokenize(sentence)
    pos_tags = nltk.pos_tag(tokens)
    return pos_tags

# Apply POS tagging to the "summary" column
POS_df['modified_summary_pos_tags'] = POS_df['modified'].apply(get_pos_tags)

POS_df

[nltk_data] Downloading package punkt to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/students/arazz002/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_pos_tags
0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","[(drowned, VBN), (wednesday, NN), (first, JJ),..."
1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","[(book, NN), (opens, VBZ), (jason, NN), (awake..."
2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","[(cugel, NNS), (easily, RB), (persuaded, VBD),..."
3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","[(book, NN), (opens, VBZ), (herald, JJ), (mage..."
4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","[(taran, NN), (gurgi, NN), (returned, VBD), (c..."
...,...,...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"['atticus', 'sullivan', 'last', 'druid', 'life...","[(atticus, NN), (sullivan, NN), (last, JJ), (d..."
4568,4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"['charlie', 'bucket', 'wonderful', 'adventure'...","[(charlie, NN), (bucket, NN), (wonderful, JJ),..."
4569,4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"['live', 'dream', 'child', 'born', 'free', 'sa...","[(live, JJ), (dream, NN), (children, NNS), (bo..."
4570,4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"['rose', 'love', 'dimitri', 'dimitri', 'might'...","[(rose, VBD), (loves, NNS), (dimitri, JJ), (di..."


In [32]:
POS_df.to_csv("POS.csv")

In [33]:
POS_df = pd.read_csv("POS.csv")

In [34]:
onehot_df  = POS_df.copy()

In [35]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

genres = onehot_df['genre']

# Reshape the data to a 2D array
genres_reshaped = genres.values.reshape(-1, 1)

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the data
one_hot_encoded = encoder.fit_transform(genres_reshaped)

# Convert the sparse matrix to a dense array
one_hot_encoded_array = one_hot_encoded.toarray()

# Add the combined array as a new column in the DataFrame
onehot_df['genre_encoded'] = list(one_hot_encoded_array)



onehot_df


Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_pos_tags,genre_encoded
0,0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","[('drowned', 'VBN'), ('wednesday', 'NN'), ('fi...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","[('book', 'NN'), ('opens', 'VBZ'), ('jason', '...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","[('cugel', 'NNS'), ('easily', 'RB'), ('persuad...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","[('book', 'NN'), ('opens', 'VBZ'), ('herald', ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","[('taran', 'NN'), ('gurgi', 'NN'), ('returned'...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"['atticus', 'sullivan', 'last', 'druid', 'life...","[('atticus', 'NN'), ('sullivan', 'NN'), ('last...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4568,4568,4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"['charlie', 'bucket', 'wonderful', 'adventure'...","[('charlie', 'NN'), ('bucket', 'NN'), ('wonder...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4569,4569,4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"['live', 'dream', 'child', 'born', 'free', 'sa...","[('live', 'JJ'), ('dream', 'NN'), ('children',...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4570,4570,4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"['rose', 'love', 'dimitri', 'dimitri', 'might'...","[('rose', 'VBD'), ('loves', 'NNS'), ('dimitri'...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [36]:
onehot_df.to_csv('onehot_df.csv')

In [37]:
onehot_df = pd.read_csv('onehot_df.csv')

Data is imbalanced so I either have to use data augmentation or use k-nearest neighbour or (I guess clusterinh).
Chat GPT suggest that some models inheritly can handle imbalance data such as decision tress, random forest or gradient boosting.

You have to also use ROC curve and F1 score for it

INstead of POS use NER

In [75]:
NER_df = tokenized_df.copy()

In [76]:
import spacy
import spacy.cli
spacy.cli.download("en_core_web_sm")
sp_sm = spacy.load("en_core_web_sm")
def spacy_larg_ner(document):
    return {(ent.text.strip(), ent.label_) for ent in sp_sm(document).ents} 

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [77]:
NER_df['modified_summary_NER_tags'] = NER_df['modified'].apply(spacy_larg_ner)

NER_df

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_NER_tags
0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","{(later wednesday, TIME), (mantis arthur, PERS..."
1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","{(hera, ORG), (zeus demigod, PERSON), (wolf, P..."
2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","{(1974, DATE), (cugel tongue, ORG), (1983, DAT..."
3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","{(shay, ORG), (valdemar vanyel, PERSON), (jisa..."
4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","{(castle llyr, PERSON), (morva, PERSON), (witc..."
...,...,...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"['atticus', 'sullivan', 'last', 'druid', 'life...","{(irish, NORP), (atticus centuries, ORG), (att..."
4568,4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"['charlie', 'bucket', 'wonderful', 'adventure'...","{(one, CARDINAL), (charlie bucket, PERSON)}"
4569,4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"['live', 'dream', 'child', 'born', 'free', 'sa...","{(one, CARDINAL), (darrow, DATE)}"
4570,4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"['rose', 'love', 'dimitri', 'dimitri', 'might'...","{(tasha mason, ORG), (annual, DATE), (janine h..."


In [78]:
NER_df.to_csv('NER_df.csv')

In [12]:
import pandas as pd

In [13]:
NER_df = pd.read_csv('NER_df.csv')

In [18]:
import spacy
import spacy.cli

# Download the English language model if not already downloaded
spacy.cli.download("en_core_web_sm")

# Load the English language model
sp_sm = spacy.load("en_core_web_sm")

def spacy_larg_ner_1(document):
    # Extract NER tags from the document and return only the labels
    return ' '.join(str(ent.label_) for ent in sp_sm(document).ents)


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [19]:
document = "Apple is looking at buying U.K. startup for $1 billion"
result = spacy_larg_ner_1(document)
print(result)  # Output will be something like: ORG, GPE, MONEY

ORG GPE MONEY


In [20]:
NER_df['modified_summary_NER_tags_1'] = NER_df['modified'].apply(spacy_larg_ner_1)


In [21]:
NER_df

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_NER_tags,modified_summary_NER_tags_1
0,0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","{('later wednesday', 'TIME'), ('mantis arthur'...",ORDINAL PERSON DATE PERSON DATE CARDINAL PERSO...
1,1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","{('hera', 'ORG'), ('zeus demigod', 'PERSON'), ...",DATE CARDINAL CARDINAL ORG CARDINAL CARDINAL O...
2,2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","{('1974', 'DATE'), ('cugel tongue', 'ORG'), ('...",CARDINAL CARDINAL ORG PERSON DATE DATE
3,3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","{('shay', 'ORG'), ('valdemar vanyel', 'PERSON'...",PERSON ORG PERSON CARDINAL CARDINAL CARDINAL C...
4,4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","{('castle llyr', 'PERSON'), ('morva', 'PERSON'...",ORDINAL PERSON DATE CARDINAL DATE ORDINAL PERS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"['atticus', 'sullivan', 'last', 'druid', 'life...","{('irish', 'NORP'), ('atticus centuries', 'ORG...",PERSON GPE DATE DATE ORG NORP
4568,4568,4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"['charlie', 'bucket', 'wonderful', 'adventure'...","{('one', 'CARDINAL'), ('charlie bucket', 'PERS...",PERSON CARDINAL
4569,4569,4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"['live', 'dream', 'child', 'born', 'free', 'sa...","{('one', 'CARDINAL'), ('darrow', 'DATE')}",CARDINAL DATE
4570,4570,4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"['rose', 'love', 'dimitri', 'dimitri', 'might'...","{('tasha mason', 'ORG'), ('annual', 'DATE'), (...",ORG DATE PERSON NORP ORG DATE DATE DATE GPE CA...


In [23]:
import pandas as pd

# Replace 'genre' with the actual name of your genre column if it's different
filtered_df = NER_df[NER_df['genre'].isin(['thriller', 'fantasy'])]

filtered_df


Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_NER_tags,modified_summary_NER_tags_1
0,0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","{('later wednesday', 'TIME'), ('mantis arthur'...",ORDINAL PERSON DATE PERSON DATE CARDINAL PERSO...
1,1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","{('hera', 'ORG'), ('zeus demigod', 'PERSON'), ...",DATE CARDINAL CARDINAL ORG CARDINAL CARDINAL O...
2,2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","{('1974', 'DATE'), ('cugel tongue', 'ORG'), ('...",CARDINAL CARDINAL ORG PERSON DATE DATE
3,3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","{('shay', 'ORG'), ('valdemar vanyel', 'PERSON'...",PERSON ORG PERSON CARDINAL CARDINAL CARDINAL C...
4,4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","{('castle llyr', 'PERSON'), ('morva', 'PERSON'...",ORDINAL PERSON DATE CARDINAL DATE ORDINAL PERS...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567,4567,4567,4567,4567,4652,Hounded,fantasy,"Atticus O’Sullivan, last of the Druids, lives ...",atticus sullivan last druids lives peacefully ...,atticus sullivan last druid life peacefully ar...,"['atticus', 'sullivan', 'last', 'druid', 'life...","{('irish', 'NORP'), ('atticus centuries', 'ORG...",PERSON GPE DATE DATE ORG NORP
4568,4568,4568,4568,4568,4653,Charlie and the Chocolate Factory,fantasy,Charlie Bucket's wonderful adventure begins wh...,charlie bucket wonderful adventure begins find...,charlie bucket wonderful adventure begin find ...,"['charlie', 'bucket', 'wonderful', 'adventure'...","{('one', 'CARDINAL'), ('charlie bucket', 'PERS...",PERSON CARDINAL
4569,4569,4569,4569,4569,4654,Red Rising,fantasy,"""I live for the dream that my children will be...",live dream children born free says like land f...,live dream child born free say like land fathe...,"['live', 'dream', 'child', 'born', 'free', 'sa...","{('one', 'CARDINAL'), ('darrow', 'DATE')}",CARDINAL DATE
4570,4570,4570,4570,4570,4655,Frostbite,fantasy,"Rose loves Dimitri, Dimitri might love Tasha, ...",rose loves dimitri dimitri might love tasha ma...,rose love dimitri dimitri might love tasha mas...,"['rose', 'love', 'dimitri', 'dimitri', 'might'...","{('tasha mason', 'ORG'), ('annual', 'DATE'), (...",ORG DATE PERSON NORP ORG DATE DATE DATE GPE CA...


In [24]:
filtered_df.to_csv('filtered_df.csv')

In [2]:
filtered_df = pd.read_csv('filtered_df.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
filtered_df.loc[:, 'genre_encoded'] = label_encoder.fit_transform(filtered_df['genre'])


In [29]:
pd.set_option('display.max_rows', None)

In [30]:
filtered_df

Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,index,title,genre,summary,modified,modified_lemmatized,word_tokenized_summary,modified_summary_NER_tags,modified_summary_NER_tags_1,genre_encoded
0,0,0,0,0,0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...,drowned wednesday first trustee among morrow d...,drowned wednesday first trustee among morrow d...,"['drowned', 'wednesday', 'first', 'trustee', '...","{('later wednesday', 'TIME'), ('mantis arthur'...",ORDINAL PERSON DATE PERSON DATE CARDINAL PERSO...,0
1,1,1,1,1,1,1,The Lost Hero,fantasy,"As the book opens, Jason awakens on a school ...",book opens jason awakens school bus unable rem...,book open jason awakens school bus unable reme...,"['book', 'open', 'jason', 'awakens', 'school',...","{('hera', 'ORG'), ('zeus demigod', 'PERSON'), ...",DATE CARDINAL CARDINAL ORG CARDINAL CARDINAL O...,0
2,2,2,2,2,2,2,The Eyes of the Overworld,fantasy,Cugel is easily persuaded by the merchant Fia...,cugel easily persuaded merchant fianosther att...,cugel easily persuaded merchant fianosther att...,"['cugel', 'easily', 'persuaded', 'merchant', '...","{('1974', 'DATE'), ('cugel tongue', 'ORG'), ('...",CARDINAL CARDINAL ORG PERSON DATE DATE,0
3,3,3,3,3,3,3,Magic's Promise,fantasy,The book opens with Herald-Mage Vanyel return...,book opens herald mage vanyel returning countr...,book open herald mage vanyel returning country...,"['book', 'open', 'herald', 'mage', 'vanyel', '...","{('shay', 'ORG'), ('valdemar vanyel', 'PERSON'...",PERSON ORG PERSON CARDINAL CARDINAL CARDINAL C...,0
4,4,4,4,4,4,4,Taran Wanderer,fantasy,Taran and Gurgi have returned to Caer Dallben...,taran gurgi returned caer dallben following ev...,taran gurgi returned caer dallben following ev...,"['taran', 'gurgi', 'returned', 'caer', 'dallbe...","{('castle llyr', 'PERSON'), ('morva', 'PERSON'...",ORDINAL PERSON DATE CARDINAL DATE ORDINAL PERS...,0
5,5,5,5,5,5,5,Thendara House,fantasy,The novel concerns the dwelling of the Darkov...,novel concerns dwelling darkovan order renunci...,novel concern dwelling darkovan order renuncia...,"['novel', 'concern', 'dwelling', 'darkovan', '...","{('thendara house exchange free', 'ORG'), ('ma...",PERSON ORG GPE,0
6,6,6,6,6,6,6,The Thief,fantasy,"Gen is released from prison by the magus, the...",gen released prison magus king scholar magus f...,gen released prison magus king scholar magus f...,"['gen', 'released', 'prison', 'magus', 'king',...","{('two', 'CARDINAL'), ('gen released', 'PERSON')}",PERSON CARDINAL,0
7,7,7,7,7,7,7,The Sweet Far Thing,fantasy,The prologue begins with two men who are sear...,prologue begins two men searching river london...,prologue begin two men searching river london ...,"['prologue', 'begin', 'two', 'men', 'searching...","{('london', 'GPE'), ('three month', 'DATE'), (...",CARDINAL GPE DATE ORG PERSON DATE ORG GPE GPE ...,0
8,8,8,8,8,8,8,Mistborn: The Final Empire,fantasy,"In Luthadel, the capital city of the Final Em...",luthadel capital city final empire vin scrawny...,luthadel capital city final empire vin scrawny...,"['luthadel', 'capital', 'city', 'final', 'empi...","{('marsh kelsier', 'PERSON'), ('dockson ham br...",ORG ORG PERSON DATE PERSON PERSON PERSON ORG C...,0
9,9,9,9,9,9,9,The Sorcerer's Ship,fantasy,A man named Gene finds himself cast into a ne...,man named gene finds cast new world powerful g...,man named gene find cast new world powerful go...,"['man', 'named', 'gene', 'find', 'cast', 'new'...",set(),,0


In [5]:
genre_count2 = filtered_df['genre'].value_counts()
genre_count2

genre
thriller    998
fantasy     860
Name: count, dtype: int64

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Drop rows with NaN values in 'modified_summary_NER_tags_1'
filtered_df.dropna(subset=['modified_summary_NER_tags_1'], inplace=True)


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=59)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_df['modified_summary_NER_tags_1'], filtered_df['genre_encoded'], test_size=0.2, random_state=46)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the classifier (example: Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train_tfidf, y_train)

# Predict on the testing data
predictions = classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)



Accuracy: 0.6346153846153846


In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision
precision = precision_score(y_test, predictions, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions, average='weighted')

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.6357854251012146
Recall: 0.6346153846153846
F1 Score: 0.6219712586190063


In [10]:
from sklearn.model_selection import cross_val_score

# Define the classifier
classifier = MultinomialNB()

# Perform cross-validation
cv_scores = cross_val_score(classifier, X_train_tfidf, y_train, cv=5)

# Calculate mean accuracy of cross-validation scores
mean_cv_accuracy = cv_scores.mean()

print("Cross-Validation Mean Accuracy:", mean_cv_accuracy)


Cross-Validation Mean Accuracy: 0.6221685033771774


In [11]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import accuracy_score

# Initialize the classifier
classifier = MultinomialNB()

# Initialize Leave-One-Out Cross-Validation
loo = LeaveOneOut()

# Initialize an empty list to store predictions
predictions = []

# Perform Leave-One-Out Cross-Validation
for train_index, test_index in loo.split(X_train_tfidf):
    X_train_fold, X_test_fold = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Fit the classifier on the training fold
    classifier.fit(X_train_fold, y_train_fold)
    
    # Make predictions on the test fold
    fold_predictions = classifier.predict(X_test_fold)
    
    # Append the predictions to the list
    predictions.append(fold_predictions[0])  # Since we are using LOOCV, there's only one prediction per fold

# Calculate accuracy
accuracy = accuracy_score(y_train, predictions)
print("Leave-One-Out Cross-Validation Accuracy:", accuracy)


Leave-One-Out Cross-Validation Accuracy: 0.6118375774260152


In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Define the classifier
classifier = MultinomialNB()

# Define the number of folds (k)
k = 5

# Initialize KFold Cross-Validation
kf = KFold(n_splits=k)

# Initialize an empty list to store cross-validation scores
cv_scores = []

# Perform KFold Cross-Validation
for train_index, test_index in kf.split(X_train_tfidf):
    X_train_fold, X_test_fold = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Fit the classifier on the training fold
    classifier.fit(X_train_fold, y_train_fold)
    
    # Make predictions on the test fold
    fold_predictions = classifier.predict(X_test_fold)
    
    # Calculate accuracy for this fold
    fold_accuracy = accuracy_score(y_test_fold, fold_predictions)
    
    # Append the accuracy to the list
    cv_scores.append(fold_accuracy)

# Calculate the mean accuracy of cross-validation scores
mean_cv_accuracy = sum(cv_scores) / len(cv_scores)

print("Mean K-Fold Cross-Validation Accuracy:", mean_cv_accuracy)


Mean K-Fold Cross-Validation Accuracy: 0.6111861594975708


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_df['modified_summary_NER_tags_1'], filtered_df['genre_encoded'], test_size=0.2, random_state=46)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the classifier (example: Naive Bayes)
classifier = KNeighborsClassifier()
classifier.fit(X_train_tfidf, y_train)

# Predict on the testing data
predictions = classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)



Accuracy: 0.6208791208791209


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate precision
precision = precision_score(y_test, predictions, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions, average='weighted')

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.6231932089074946
Recall: 0.6208791208791209
F1 Score: 0.6215688495858296


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import accuracy_score

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_df['modified_summary_NER_tags_1'], filtered_df['genre_encoded'], test_size=0.2, random_state=46)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the classifier (example: Naive Bayes)
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict on the testing data
predictions = classifier.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')

# Calculate recall
recall = recall_score(y_test, predictions, average='weighted')

# Calculate F1 score
f1 = f1_score(y_test, predictions, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)




Accuracy: 0.6593406593406593
Precision: 0.658228875343248
Recall: 0.6593406593406593
F1 Score: 0.6584651159506589


In [25]:
def evaluate_model(classifier, X_train, X_test, y_train, y_test):
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the training data
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    # Transform the testing data
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Initialize and train the classifier
    classifier.fit(X_train_tfidf, y_train)

    # Predict on the testing data
    predictions = classifier.predict(X_test_tfidf)

    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')

    return accuracy, precision, recall, f1


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier

# Define a list of classifiers
classifiers = [
    KNeighborsClassifier(),
    LogisticRegression(),
    SGDClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier(),
    SVC(),
    BernoulliNB(),
    MultinomialNB(),
    ExtraTreeClassifier(),
    OneVsRestClassifier(SVC()),
    XGBClassifier()
]

In [37]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(filtered_df['modified_summary_NER_tags_1'], filtered_df['genre_encoded'], test_size=0.2, random_state=46)

# Create an empty DataFrame
results_df = pd.DataFrame(columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Loop through classifiers and store results in the DataFrame
for classifier in classifiers:
    accuracy, precision, recall, f1 = evaluate_model(classifier, X_train, X_test, y_train, y_test)
    results_df.loc[len(results_df)] = [type(classifier).__name__, accuracy, precision, recall, f1]

results_df

Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,KNeighborsClassifier,0.620879,0.623193,0.620879,0.621569
1,LogisticRegression,0.659341,0.658229,0.659341,0.658465
2,SGDClassifier,0.637363,0.646726,0.637363,0.637691
3,RandomForestClassifier,0.607143,0.60656,0.607143,0.60681
4,GradientBoostingClassifier,0.642857,0.642504,0.642857,0.642661
5,AdaBoostClassifier,0.645604,0.647097,0.645604,0.646113
6,BaggingClassifier,0.629121,0.631689,0.629121,0.629832
7,SVC,0.673077,0.675557,0.673077,0.673704
8,BernoulliNB,0.612637,0.610819,0.612637,0.611168
9,MultinomialNB,0.634615,0.635785,0.634615,0.621971
