In [None]:
!pip install nltk scikit-learn pandas



In [None]:
# imports

import nltk
import re
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages in the same order as before. The last call is
# kept as the cell's final expression so its return value is still displayed.
# NOTE(review): 'punkt' is not referenced by the cells visible here (tokens
# are produced with str.split) -- confirm whether it is still needed.
for package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(package)
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# sample dataset

# Each sample is a (text, label) pair; the pairs are unzipped into the
# column-oriented dict that the DataFrame is built from.
samples = [
    ("NLP is amazing!!!", "positive"),
    ("I love learning AI and NLP", "positive"),
    ("Machine learning is part of AI", "neutral"),
    ("Python is used for NLP tasks", "neutral"),
]

data = {
    "text": [sentence for sentence, _ in samples],
    "label": [sentiment for _, sentiment in samples],
}

df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,NLP is amazing!!!,positive
1,I love learning AI and NLP,positive
2,Machine learning is part of AI,neutral
3,Python is used for NLP tasks,neutral


In [None]:
# Text cleaning function

def clean_text(text):
    """Normalize raw text to lowercase a-z words separated by single spaces.

    Args:
        text: Raw input string. Non-string values (for example None or the
            float NaN of a missing DataFrame cell) are treated as empty text.

    Returns:
        str: The lowercased text in which every run of characters other than
        a-z is replaced by one space, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Substitute a space (not the empty string) so words joined only by
    # punctuation or digits, e.g. "hello,world", do not fuse into one token.
    text = re.sub(r'[^a-z]+', ' ', text)
    return text.strip()

# Run the cleaner over every raw text value and keep the result in a new column.
df['clean_text'] = df['text'].map(clean_text)
df

Unnamed: 0,text,label,clean_text
0,NLP is amazing!!!,positive,nlp is amazing
1,I love learning AI and NLP,positive,i love learning ai and nlp
2,Machine learning is part of AI,neutral,machine learning is part of ai
3,Python is used for NLP tasks,neutral,python is used for nlp tasks


In [None]:
# Stop word removal
# English stop words from NLTK, held as a set for the membership test below.
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` found in ``stop_words``.

    Args:
        text: String to filter; it is split on whitespace.

    Returns:
        str: The remaining tokens, in their original order, joined by single
        spaces. Tokens are compared exactly as they appear in ``text``.
    """
    kept = []
    for token in text.split():
        if token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Filter stop words out of the cleaned text, one row at a time.
df['no_stopwords'] = df['clean_text'].map(remove_stopwords)
df

Unnamed: 0,text,label,clean_text,no_stopwords
0,NLP is amazing!!!,positive,nlp is amazing,nlp amazing
1,I love learning AI and NLP,positive,i love learning ai and nlp,love learning ai nlp
2,Machine learning is part of AI,neutral,machine learning is part of ai,machine learning part ai
3,Python is used for NLP tasks,neutral,python is used for nlp tasks,python used nlp tasks


In [None]:
# Lemmatization
# One shared lemmatizer instance is reused for every row.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Args:
        text: String whose tokens are passed one by one to
            ``lemmatizer.lemmatize``.

    Returns:
        str: The lemmatized tokens joined by single spaces.
    """
    # No part-of-speech argument is passed, so the lemmatizer's default is
    # used (in the displayed output "tasks" becomes "task" while "used" is
    # left as is).
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)

# Lemmatize the stop-word-free text, one row at a time.
df['lemmatized_text'] = df['no_stopwords'].map(lemmatize_text)
df

Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text
0,NLP is amazing!!!,positive,nlp is amazing,nlp amazing,nlp amazing
1,I love learning AI and NLP,positive,i love learning ai and nlp,love learning ai nlp,love learning ai nlp
2,Machine learning is part of AI,neutral,machine learning is part of ai,machine learning part ai,machine learning part ai
3,Python is used for NLP tasks,neutral,python is used for nlp tasks,python used nlp tasks,python used nlp task


In [None]:
# Label Encoding
# Fit the encoder on the label column first, then convert each label to its
# integer code (neutral -> 0, positive -> 1 in the displayed output).
label_encoder = LabelEncoder().fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

# Show the original labels next to their codes.
df.loc[:, ['label', 'encoded_label']]

Unnamed: 0,label,encoded_label
0,positive,1
1,positive,1
2,neutral,0
3,neutral,0


In [None]:
# TF-IDF Representation
# Fit a TF-IDF vectorizer on the lemmatized text and get one weight vector
# per row.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['lemmatized_text'])

# Reuse df's index so each feature row stays aligned with its source row when
# the two frames are combined column-wise; a fresh 0..n-1 index would
# misalign the rows if df had been filtered or re-indexed beforehand.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out()
)

tfidf_df

Unnamed: 0,ai,amazing,learning,love,machine,nlp,part,python,task,used
0,0.0,0.842926,0.0,0.0,0.0,0.538029,0.0,0.0,0.0,0.0
1,0.484263,0.0,0.484263,0.614226,0.0,0.392053,0.0,0.0,0.0,0.0
2,0.437791,0.0,0.437791,0.0,0.555283,0.0,0.555283,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.345783,0.0,0.541736,0.541736,0.541736


In [None]:
# Combine Final Output
# Place the TF-IDF feature columns to the right of the processed text columns.
# NOTE(review): a vocabulary term equal to an existing column name (such as
# "text" or "label") would produce duplicate column labels -- confirm whether
# that can occur with the real data.
frames = [df, tfidf_df]
final_df = pd.concat(frames, axis="columns")
final_df


Unnamed: 0,text,label,clean_text,no_stopwords,lemmatized_text,encoded_label,ai,amazing,learning,love,machine,nlp,part,python,task,used
0,NLP is amazing!!!,positive,nlp is amazing,nlp amazing,nlp amazing,1,0.0,0.842926,0.0,0.0,0.0,0.538029,0.0,0.0,0.0,0.0
1,I love learning AI and NLP,positive,i love learning ai and nlp,love learning ai nlp,love learning ai nlp,1,0.484263,0.0,0.484263,0.614226,0.0,0.392053,0.0,0.0,0.0,0.0
2,Machine learning is part of AI,neutral,machine learning is part of ai,machine learning part ai,machine learning part ai,0,0.437791,0.0,0.437791,0.0,0.555283,0.0,0.555283,0.0,0.0,0.0
3,Python is used for NLP tasks,neutral,python is used for nlp tasks,python used nlp tasks,python used nlp task,0,0.0,0.0,0.0,0.0,0.0,0.345783,0.0,0.541736,0.541736,0.541736


In [None]:
# Save Output to File
# Write the combined frame to CSV without the index column, then confirm.
output_path = "assignment3_output.csv"
final_df.to_csv(output_path, index=False)
print("File saved successfully!")

File saved successfully!
