In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h2>Importing Libraries</h2>

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
import nltk
# Download the 'stopwords' corpus so that stopwords.words('english') can be
# read by the cells below.
nltk.download('stopwords')

In [None]:
# Print the English stop-word list.
print(stopwords.words('english'))
# These words add little information about sentiment, so they are removed
# during preprocessing; dropping them also reduces the amount of text that
# has to be processed.

<h2>Data Processing</h2>

In [None]:
# Quick look at the raw CSV before column names are assigned.
# header=None: the file is loaded without a header row (the next cell supplies
#   the column names itself), so the first record must not be consumed as
#   column labels, which is what the default header inference would do.
# nrows=5: a preview only needs a handful of rows; the complete file is loaded
#   once, in the next cell, instead of being parsed twice.
raw_preview = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
    nrows=5,
)
raw_preview

In [None]:
# Load the full dataset, supplying the column labels explicitly
columns_names = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    names=columns_names,
)
twitter_data

In [None]:
# (number of rows, number of columns) of the loaded frame
twitter_data.shape

In [None]:
# Count the missing values in each column
twitter_data.isnull().sum()

In [None]:
# Count the duplicated rows in the frame
twitter_data.duplicated().sum()

In [None]:
# Class balance: how many tweets carry each value of 'target'
twitter_data['target'].value_counts()

<h3>Converting the target '4' to '1'</h3>

In [None]:
# Relabel target value 4 as 1; every other value in the column is left as is
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [None]:
# Re-check the class balance after relabelling 4 -> 1
twitter_data['target'].value_counts()

<h3>0 --> Negative Tweet</h3>
    <br>
<h3>1 --> Positive Tweet</h3>

<h2><b>Stemming</b></h2>
    <br>
<h4>Stemming is the process of reducing a word to its root word.</h4>

In [None]:
# Single shared Porter stemmer instance, used by stemming() below
port_stem = PorterStemmer()

In [None]:
# Build the lookup structures once rather than on every call. The previous
# version re-evaluated stopwords.words('english') and scanned the resulting
# list for every single token, which is needlessly slow when the function is
# applied to every row of the dataset.
_STOP_WORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise one piece of text into a space-separated string of word stems.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the text, split on whitespace, drop English stop words, stem
    the remaining tokens with the module-level ``port_stem`` stemmer, and
    join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, stemmed text; an empty string if no token survives.
    """
    tokens = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        port_stem.stem(token) for token in tokens if token not in _STOP_WORDS
    )

In [None]:
# Clean and stem every tweet; the result is stored in a new 'stemmed_content' column
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [None]:
# Spot-check the first rows, including the new 'stemmed_content' column
twitter_data.head()

In [None]:
# The raw text is not used again once 'stemmed_content' exists.
# errors='ignore' keeps this cell re-runnable: an in-place drop without it
# raises KeyError on a second execution because 'text' is already gone.
twitter_data = twitter_data.drop(columns=['text'], errors='ignore')

In [None]:
# Separate the features (cleaned tweet text) from the labels (target)
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

<h2>Splitting the data into training data and test data</h2>

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [None]:
# Release the unsplit arrays; only the train/test pieces are used from here on
del X, Y

<h2>Converting the textual data to numerical data</h2>

In [None]:
# TF-IDF vectorizer that turns each cleaned tweet into numeric features
vectorizer = TfidfVectorizer()

# Fit on the training split only, then apply the already-fitted vectorizer to
# the test split so the test data does not influence the learned features.
# NOTE(review): X_train / X_test are overwritten with their transformed
# versions here, so re-running this cell would feed the already-transformed
# data back into the vectorizer; re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

<h2>Training the Model</h2>

In [None]:
# Logistic-regression classifier; max_iter caps the solver at 1000 iterations
model = LogisticRegression(max_iter = 1000)


In [None]:
# Train the classifier on the vectorized training tweets and their labels
model.fit(X_train, y_train)

<h2>Model Evaluation</h2>

In [None]:
# Predict on the held-out split and report the evaluation metrics
y_pred = model.predict(X_test)

# Scalar metrics share the same call signature, so drive them from a table;
# the labels (including their leading newlines) are printed exactly as given.
scalar_metrics = [
    ('Accuracy Score', accuracy_score),
    ('\nPrecision Score', precision_score),
    ('\nRecall Score', recall_score),
    ('\nf1 Score', f1_score),
]
for label, metric in scalar_metrics:
    print(label, metric(y_test, y_pred))

# Per-class breakdowns
print('\nConfusion Matrix\n', confusion_matrix(y_test, y_pred))
print('\nClassification Report\n', classification_report(y_test, y_pred))

# Save the Model

In [None]:
import joblib

# Persist both the fitted classifier and the fitted vectorizer to the current
# working directory. Both are saved because the model was trained on the
# vectorizer's output, so new text must pass through the same vectorizer
# before it can be given to the model.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
