In [1]:
# import nltk

# # Download the stopwords corpus
# nltk.download('stopwords')

# # Now you can safely import and use stopwords
# from nltk.corpus import stopwords

In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Load dataset
train_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\train.csv",encoding='ISO-8859-1')
test_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\test.csv",encoding='ISO-8859-1')

# Preprocess data
def preprocess_text(text):
    tokenizer = TweetTokenizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))

    tokens = tokenizer.tokenize(text)
    stemmed = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stemmed)

# Ensure text columns are of type string
train_df['text'] = train_df['text'].astype(str)
test_df['text'] = test_df['text'].astype(str)

# Now apply your preprocessing function
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectors = vectorizer.fit_transform(train_df['cleaned_text'])

# Model
model = LogisticRegression()
model.fit(X_train_vectors, train_df['sentiment'])

In [3]:
X_test_vectors = vectorizer.transform(test_df['cleaned_text'])
predictions = model.predict(X_test_vectors)

# Evaluation
print("Accuracy:", accuracy_score(test_df['sentiment'], predictions))

Accuracy: 0.6975099037917374


In [4]:
# Setup grid search


In [5]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Load dataset
train_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\train.csv",encoding='ISO-8859-1')
test_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\test.csv",encoding='ISO-8859-1')

# Preprocess data
def preprocess_text(text):
    tokenizer = TweetTokenizer()
    stemmer = SnowballStemmer('english')
    stop_words = set(stopwords.words('english'))

    tokens = tokenizer.tokenize(text)
    stemmed = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stemmed)

## Load dataset
train_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\train.csv", encoding='ISO-8859-1')

# Preprocessing text
train_df['text'] = train_df['text'].astype(str)
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)

# Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(train_df['cleaned_text'])
y = train_df['sentiment']

# Splitting dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
model = LogisticRegression()

# Define the hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # These solvers are good for smaller datasets and support both l1 and l2 penalties
}

# Setup GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters found: ", grid_search.best_params_)

# Best score
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate on the validation set
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)
print("Validation set accuracy: {:.2f}".format(accuracy_score(y_val, val_predictions)))

Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best parameters found:  {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation score: 0.70
Validation set accuracy: 0.70




In [6]:
train_df.head()

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,cleaned_text
0,cb774db0d1,"I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,"i ` respond , i go"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative,noon,21-30,Albania,sooo sad i miss san diego ! ! !
2,088c60f138,my boss is bullying me...,negative,night,31-45,Algeria,boss bulli ...
3,9642c003ef,what interview! leave me alone,negative,morning,46-60,Andorra,interview ! leav alon
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative,noon,60-70,Angola,"son * * * , ` put releas alreadi bought"


In [7]:
train_df.isnull().sum()

textID           0
text             0
sentiment        0
Time of Tweet    0
Age of User      0
Country          0
cleaned_text     0
dtype: int64

In [8]:
# import pandas as pd
# from nltk.corpus import stopwords
# from nltk.stem import SnowballStemmer
# from nltk.tokenize import TweetTokenizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import accuracy_score

# # Assuming your preprocess_text function is defined above

# # Load dataset
# train_df = pd.read_csv(r"D:\sentiment-analysis-webapp\data\train.csv", encoding='ISO-8859-1')

# # Preprocessing text
# train_df['text'] = train_df['text'].astype(str)
# train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)

# X = train_df['cleaned_text']
# y = train_df['sentiment']

# # Splitting dataset into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define a pipeline with TfidfVectorizer and LogisticRegression
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('clf', LogisticRegression()),
# ])

# # Define the hyperparameter grid to search
# param_grid = {
#     'tfidf__max_df': [0.5, 0.75, 1.0],
#     'tfidf__min_df': [1, 2, 3],
#     'tfidf__ngram_range': [(1, 1), (1, 2)],
#     'tfidf__max_features': [500, 1000, None],
#     'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'clf__penalty': ['l1', 'l2'],
#     'clf__solver': ['liblinear', 'saga']  # These solvers are good for smaller datasets and support both l1 and l2 penalties
# }

# # Setup GridSearchCV
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)

# # Perform the grid search on the training data
# grid_search.fit(X_train, y_train)

# # Best parameters
# print("Best parameters found: ", grid_search.best_params_)

# # Best score
# print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# # Evaluate on the validation set
# best_model = grid_search.best_estimator_
# val_predictions = best_model.predict(X_val)
# print("Validation set accuracy: {:.2f}".format(accuracy_score(y_val, val_predictions)))


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits




KeyboardInterrupt: 