In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

# Fetch the NLTK 'punkt' resource up front (presumably what nltk.word_tokenize
# in the tokenizer cell relies on — confirm against the installed NLTK version).
nltk.download('punkt')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and consistency warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

In [3]:
# Shared Porter stemmer instance, created once and reused by every tokenizer call.
stemmer = nltk.stem.PorterStemmer()


def tokenizer(text: str):
  """Turn raw text into a list of stemmed, lower-case word tokens.

  The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
  are not purely alphabetic, or that appear in ``ENGLISH_STOP_WORDS``, are
  discarded; each surviving token is reduced with the module-level Porter
  stemmer.

  Args:
    text: The document to tokenize.

  Returns:
    The stemmed tokens, in their original order.
  """
  stems = []
  for candidate in nltk.word_tokenize(text.lower()):
    # Skip punctuation/numeric tokens and stop words before stemming.
    if not candidate.isalpha() or candidate in ENGLISH_STOP_WORDS:
      continue
    stems.append(stemmer.stem(candidate))
  return stems

In [5]:
def _load_phrases(path: str) -> pd.DataFrame:
  """Read a tab-separated phrase file and prepare its 'Phrase' column.

  The 'SentenceId' column is dropped and every "n't" in 'Phrase' is rewritten
  to "not".

  Args:
    path: Location of the .tsv file to read.

  Returns:
    A new DataFrame without 'SentenceId' and with the rewritten 'Phrase' column.

  Raises:
    KeyError: If the file has no 'SentenceId' or 'Phrase' column.
  """
  frame = pd.read_csv(path, sep='\t').drop(columns='SentenceId')
  # Missing phrases become empty strings so the vectorizer never receives NaN
  # (they end up as all-zero feature rows).
  # NOTE(review): "not" appears to be part of ENGLISH_STOP_WORDS, so the token
  # produced by this rewrite is probably removed again by the tokenizer —
  # confirm whether negation is really meant to be kept as a feature.
  phrases = frame['Phrase'].fillna('').str.replace("n't", "not", regex=False)
  return frame.assign(Phrase=phrases)


def _with_tfidf_features(frame: pd.DataFrame, fitted_vectorizer) -> pd.DataFrame:
  """Replace 'Phrase' with one dense TF-IDF column per vocabulary term.

  Args:
    frame: DataFrame holding a 'Phrase' column of strings.
    fitted_vectorizer: An already fitted TfidfVectorizer.

  Returns:
    A new DataFrame: the non-text columns of ``frame`` followed by the TF-IDF
    feature columns, named after the vectorizer's feature names.
  """
  features = pd.DataFrame(
      fitted_vectorizer.transform(frame['Phrase']).toarray(),
      columns=fitted_vectorizer.get_feature_names_out(),
      index=frame.index,  # keep rows aligned with `frame` during the concat
  )
  return pd.concat([frame.drop(columns='Phrase'), features], axis=1)


df_train = _load_phrases('train.tsv')
# Fix: the test frame used to be read from 'train.tsv' as well, which made the
# exported "test" features a copy of the training data.
df_test = _load_phrases('test.tsv')

# Fit on the training phrases only; the same fitted vocabulary is then applied
# to both splits so their feature columns match.
vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    ngram_range=(1, 2),
    # Passed as a list: newer scikit-learn releases validate `stop_words` and
    # only accept 'english', a list, or None (the raw frozenset is rejected).
    stop_words=list(ENGLISH_STOP_WORDS),
    max_df=0.9,
    max_features=1000,
).fit(df_train['Phrase'])

df_train = _with_tfidf_features(df_train, vectorizer)
df_test = _with_tfidf_features(df_test, vectorizer)

In [5]:
# Persist both vectorized frames as CSV files (index column included, as
# DataFrame.to_csv does by default).
for frame, destination in (
    (df_train, 'Sentiment Analysis on Movie Reviews/vec_train.csv'),
    (df_test, 'Sentiment Analysis on Movie Reviews/vec_test.csv'),
):
    frame.to_csv(destination)