# Importing libraries

In [1]:
# Basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pickle
import warnings
warnings.filterwarnings(action='ignore')

# nltk
import nltk
nltk.download('stopwords')

## Preprocessing libraries
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# For Model training
from scipy.stats import uniform                 # Used to sample hyperparameter values from a continuous range.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC              # a variant of SVC optimized for large datasets

# Metrics for accuracy
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

ModuleNotFoundError: No module named 'numpy'

In [None]:
# Mount Google Drive at /content/drive so the CSV stored there can be read.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab,
# so this cell is expected to fail in a plain Jupyter environment — confirm.
from google.colab import drive
drive.mount('/content/drive')

# Accessing the dataset: path of the tweet CSV inside the mounted Drive
dataset_path = "/content/drive/MyDrive/tweet/tweet.csv"

# Reading our Dataset
## Dataset details
- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- ids: the id of the tweet (2087)
- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- flag: The query (lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (robotickilldozr)
- text: the text of the tweet (Lyx is cool)

In [None]:
# Column names are supplied explicitly, so pandas treats every row of the CSV
# (including the first one) as data.
columns = [
    "target",
    "ids",
    "date",
    "flag",
    "user",
    "text",
]
df_read = pd.read_csv(
    dataset_path,
    names=columns,
    encoding='latin1',
)
print(df_read.shape)   # (rows, columns) of the raw frame
df_read.head()

### Building a DataFrame with only the columns we need (`text` and `target`)

In [None]:
# Keep only the tweet text and its sentiment label from the raw frame.
data = {name: df_read[name].values for name in ('text', 'target')}
df = pd.DataFrame(data)
df.head()

In [None]:
# Seeing the distribution of negative and positive tweets in the target column.
# seaborn draws numeric categories in ascending order, so the first colour goes to
# the lowest label (0 = negative) and the second to the next one (4 = positive);
# the palette is ordered red, green so that negative is red and positive is green.
# NOTE(review): assumes only the labels 0 and 4 occur in this file — confirm.
fig, ax = plt.subplots(figsize=(7, 3))
sns.countplot(data=df, x='target', palette=['red', 'green'], ax=ax)
ax.set(
    title='Tweets per sentiment label',
    xlabel='target (0 = negative, 4 = positive)',
    ylabel='number of tweets',
)
plt.show()

In [None]:
# Data cleaning and preprocessing
# Every tweet is reduced to lower-case alphabetic tokens, English stopwords are
# dropped, and the surviving tokens are Porter-stemmed and re-joined with spaces.
corpus = []
ps = PorterStemmer()

# Build the stopword set once. The previous version called stopwords.words('english')
# and wrapped it in a new set for every single token, which is loop-invariant work.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')                        # anything that is not an ASCII letter

# Iterate over the column values directly instead of df['text'][i], which relies on
# the index being exactly 0..n-1.
for text in df['text']:
    tokens = non_letters.sub(' ', text).lower().split()      # strip symbols, lower-case, tokenise
    # The stopword test is applied to the un-stemmed token, then the token is stemmed.
    review = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
    corpus.append(review)                                    # one cleaned string per tweet

In [None]:
# Creating the TF-IDF vectorizer (unigrams and bigrams, max_features=500000)
cv = TfidfVectorizer(ngram_range=(1,2), max_features=500000)

In [None]:
# We will use X as independent feature section (TF-IDF matrix built from `corpus`)
# NOTE(review): the vectorizer is fitted on the whole corpus here, i.e. before the
# train/test split further down, so its vocabulary and IDF weights also reflect
# tweets that end up in the test set. Consider fitting on the training part only.
X = cv.fit_transform(corpus)
# We will use y as dependent feature section (the `target` label column)
y=df['target']

In [None]:
# Report how many terms the fitted vectorizer exposes as features.
feature_names = cv.get_feature_names_out()
print('No. of feature_words: ', len(feature_names))

In [None]:
# Serialise the fitted TfidfVectorizer to 'cv-transform.pkl' in the working directory.
with open('cv-transform.pkl', mode='wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

## Model Training

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Candidate classifiers, each created with its default hyperparameters.
# The list order defines the "M-O-D-E-L" number printed by the evaluation loop.
model = [LogisticRegression(), BernoulliNB(), LinearSVC()]
model1, model2, model3 = model

In [None]:
# Fit every candidate on the training split and report its metrics on the test split.
# scikit-learn metrics take (y_true, y_pred). The arguments were previously passed the
# other way round, which transposes the confusion matrix and swaps precision with
# recall in the classification report (accuracy is symmetric and was unaffected).
for i, algo in enumerate(model, start=1):
  print("M-O-D-E-L :",i)
  algo.fit(X_train, y_train)
  y_pred=algo.predict(X_test)
  # Checking the accuracy
  print("Confusion matrix : \n",confusion_matrix(y_test,y_pred))
  print("Accuracy score : ",accuracy_score(y_test,y_pred))
  print("Classification Report : \n",classification_report(y_test,y_pred))
  print("-----------------------------------------------------------\n")

**Note:** Model 1 (Logistic Regression) performs the best of the three models.

## Doing Hyperparameter Tuning for Logistic Regression

In [None]:
# Search space for the hyperparameter tuning.
# scipy's uniform(loc, scale) covers [loc, loc + scale], so C is drawn from [0.1, 1.1].
param_dist = dict(
    C=uniform(loc=0.1, scale=1.0),
    penalty=['l2'],
    solver=['liblinear', 'saga'],
)

In [None]:
# Randomized search over `param_dist` for the logistic-regression model:
# 10 sampled candidates, 5-fold cross-validation, scored by accuracy, all cores.
# random_state pins which candidates are sampled, so a re-run explores the same
# settings (0 matches the seed used for the train/test split).
# NOTE(review): the search object is bound to the name `LogisticRegression`, which
# shadows the scikit-learn class imported at the top of the notebook; re-running the
# model-creation cell after this one will fail. The name is kept here only because
# the following cells read it — rename it together with those cells.
LogisticRegression = RandomizedSearchCV(
    estimator=model1,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    random_state=0,
)
LogisticRegression.fit(X_train, y_train)

In [None]:
# Show the winning hyperparameter combination and the score it achieved in the search.
best_params = LogisticRegression.best_params_
best_score = LogisticRegression.best_score_
print("Best parameters: ", best_params)
print("Best score: ", best_score)

In [None]:
# Predicting the test-set labels with the tuned logistic-regression search object
y_pred=LogisticRegression.predict(X_test)

In [None]:
# Checking the accuracy of the tuned model on the test split.
# scikit-learn metrics take (y_true, y_pred); with the arguments the other way round
# the confusion matrix is transposed and precision/recall trade places in the report.
print("Confusion matrix : \n",confusion_matrix(y_test,y_pred))
print("Accuracy score : ",accuracy_score(y_test,y_pred))
print("Classification Report : \n",classification_report(y_test,y_pred))

In [2]:
# Serialise the fitted search object (the tuned logistic regression) to "tweetmodel.pkl".
# Note that the whole RandomizedSearchCV instance is written, not only its best estimator.
with open("tweetmodel.pkl", mode="wb") as model_file:
  pickle.dump(LogisticRegression, model_file)

NameError: name 'pickle' is not defined