In [1]:
import pandas as pd

# List of encodings to try
encodings = ['utf-8', 'ISO-8859-1', 'cp1252']
column_names = ["target", "ids", "date", "flag", "user", "text"]

# Initialize DataFrame as None
df = None

# Try reading the file with different encodings
for encoding in encodings:
    try:
        df = pd.read_csv("C:\\Users\\user\\Desktop\\guvi_p\\twitter_new.csv", names=column_names, encoding=encoding)
        break  # Stop trying if successful
    except UnicodeDecodeError:
        continue  # Try the next encoding if decoding fails

# Check if DataFrame was successfully loaded
if df is not None:
    print("Successfully loaded the DataFrame.")
else:
    print("Unable to load the DataFrame with any encoding.")


Successfully loaded the DataFrame.


In [2]:
import pandas as pd
from dateutil import parser, tz
import warnings


# Disable the UnknownTimezoneWarning to avoid the warning message
warnings.filterwarnings("ignore", category=parser.UnknownTimezoneWarning)

# Define a custom tzinfos dictionary to map timezone abbreviations to timezone offsets
tzinfos = {"PDT": tz.gettz("America/Los_Angeles")}

# Convert the 'date' column to datetime format with custom tzinfos
df['date'] = df['date'].apply(lambda x: parser.parse(x, tzinfos=tzinfos))


In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


# Function to clean and preprocess tweet text
def preprocess_tweet(text):
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove repeating characters
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # Remove numeric numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenization using TweetTokenizer
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text


# Apply preprocessing to the 'text' column in-place
df['text'] = df['text'].apply(preprocess_tweet)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
y = df['target']
X= df['text']

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)

In [8]:
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score


# Train a Logistic Regression classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
precision_test = precision_score(y_test, y_pred, average='weighted')
recall_test = recall_score(y_test, y_pred, average='weighted')
f1_test = f1_score(y_test, y_pred, average='weighted')


print('Accuracy:',accuracy)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.80028125
Precision: 0.8005354514476956
Recall: 0.80028125
F1 Score: 0.8002263229925758


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd



# Define the parameter grid
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'penalty': ['l1', 'l2']}

# Create a Logistic Regression model
logreg = LogisticRegression(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Train the model with the best parameters
best_logreg = LogisticRegression(**best_params, random_state=42)
best_logreg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = best_logreg.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
precision_test = precision_score(y_test, y_pred, average='weighted')
recall_test = recall_score(y_test, y_pred, average='weighted')
f1_test = f1_score(y_test, y_pred, average='weighted')


print('Accuracy:',accuracy)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters: {'C': 1, 'penalty': 'l2'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.80028125
Precision: 0.8005354514476956
Recall: 0.80028125
F1 Score: 0.8002263229925758


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score


# Train a Logistic Regression classifier
classifier = LogisticRegression(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
precision_test = precision_score(y_test, y_pred, average='weighted')
recall_test = recall_score(y_test, y_pred, average='weighted')
f1_test = f1_score(y_test, y_pred, average='weighted')


print('Accuracy:',accuracy)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1 Score:", f1_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.80028125
Precision: 0.8005354514476956
Recall: 0.80028125
F1 Score: 0.8002263229925758


In [10]:
# OVERFITTING

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression



# Initialize and train your classification model (e.g., Decision Tree, Random Forest, etc.)
clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the training and testing data
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Calculate the accuracy of the model on the training and testing data
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print the training and testing accuracy
print("Training Accuracy:", train_accuracy)
print("Testing Accuracy:", test_accuracy)


# Calculate precision, recall, and f1-score for training data
precision_train = precision_score(y_train, y_train_pred, average='weighted')
recall_train = recall_score(y_train, y_train_pred, average='weighted')
f1_train = f1_score(y_train, y_train_pred, average='weighted')

# Calculate precision, recall, and f1-score for testing data
precision_test = precision_score(y_test, y_test_pred, average='weighted')
recall_test = recall_score(y_test, y_test_pred, average='weighted')
f1_test = f1_score(y_test, y_test_pred, average='weighted')

# Print the individual values
print("Training Precision:", precision_train)
print("Testing Precision:", precision_test)

print("Training Recall:", recall_train)
print("Testing Recall:", recall_test)

print("Training F1 Score:", f1_train)
print("Testing F1 Score:", f1_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.8351
Testing Accuracy: 0.80028125
Training Precision: 0.8354181109131364
Testing Precision: 0.8005354514476956
Training Recall: 0.8351
Testing Recall: 0.80028125
Training F1 Score: 0.8350633778897627
Testing F1 Score: 0.8002263229925758


In [12]:
# CROSS VALIDATION

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


# Create a pipeline with a text vectorizer and a classifier
model_pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2), max_features=500000),
    LogisticRegression(random_state=42)
)

# Perform cross-validation
# You can choose the number of folds with the 'cv' parameter
# 'scoring' parameter specifies the evaluation metric, e.g., 'accuracy'
cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Average Accuracy:", cv_scores.mean())




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Scores: [0.79307187 0.7905     0.78937813 0.79754375 0.79273437]
Average Accuracy: 0.7926456250000001


In [14]:
import pickle

# Save the trained classifier
with open('logistic_regression_model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectoriser, vectorizer_file)

# Save the preprocessing function
with open('preprocess_tweet_function.pkl', 'wb') as preprocess_file:
    pickle.dump(preprocess_tweet, preprocess_file)


In [15]:
# Load the trained classifier
with open('logistic_regression_model.pkl', 'rb') as model_file:
    loaded_classifier = pickle.load(model_file)

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectoriser = pickle.load(vectorizer_file)

# Load the preprocessing function
with open('preprocess_tweet_function.pkl', 'rb') as preprocess_file:
    loaded_preprocess_tweet = pickle.load(preprocess_file)

# Example: Preprocess new text using the loaded function and make predictions
new_text = "New tweet text here."
preprocessed_text = loaded_preprocess_tweet(new_text)
new_text_vectorized = loaded_vectoriser.transform([preprocessed_text])
prediction = loaded_classifier.predict(new_text_vectorized)

print("Predicted class:", prediction[0])


Predicted class: 4
