In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import pandas as pd
import joblib

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("vader_lexicon")

[nltk_data] Downloading package punkt to /home/datacoding/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datacoding/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/datacoding/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/datacoding/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
df = pd.read_csv("combined_data_all_new.csv")
df.head()

Unnamed: 0,fullname,Followers,retweets,tweets,timestamp,tweet_id,url,username
0,BilawalBhuttoZardari,5100000,4035,"For the first time in the history of Karachi, ...",2023-06-19 15:29:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari
1,BilawalBhuttoZardari,5100000,5577,"Thanks Swat! 🙏🏽 🙏🏽\nToday, we come together to...",2023-06-17 16:38:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari
2,BilawalBhuttoZardari,5100000,4118,My Depest Sympathies With the Families of the ...,2023-06-17 11:09:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari
3,BilawalBhuttoZardari,5100000,4757,I thank the Members of Asia Pacific Group at u...,2023-06-22 11:51:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1671...,@BBhuttoZardari
4,BilawalBhuttoZardari,5100000,4881,"Our party has always strived for, and Led the ...",2023-06-16 17:38:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1669...,@BBhuttoZardari


In [4]:
df.isnull().sum()

fullname     0
Followers    0
retweets     0
tweets       0
timestamp    0
tweet_id     0
url          0
username     0
dtype: int64

In [5]:
def clean_text(text):
        text = str(text)
        text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove mentions
        text = re.sub(r'#', '', text)  # remove hashtags
        text = re.sub(r'RT[\s]+', '', text)  # remove retweets
        text = re.sub(r'https?:\/\/\S+', '', text)  # remove links
        text = re.sub(r'[^A-Za-z0-9\s]+', '', text)  # remove special characters
       
        return " ".join(nltk.word_tokenize(text.lower().strip()))

df["cleaned_tweets"] = df["tweets"].apply(clean_text)

In [6]:
df["cleaned_tweets"].head()

0    for the first time in the history of karachi i...
1    thanks swat today we come together to maintain...
2    my depest sympathies with the families of the ...
3    i thank the members of asia pacific group at u...
4    our party has always strived for and led the w...
Name: cleaned_tweets, dtype: object

In [7]:
df["cleaned_tweets"][0]

'for the first time in the history of karachi i congratulate pakistan peoples partys barrister murtaza wahab on mayor karachi and salman abdullah murad on the oath of office it is hoped that the mayor karachi and deputy mayor karachi will deliver the city to new heights of development while serving the people of karachi'

In [8]:
def analyze_sentiment(text):
        
        sid = SentimentIntensityAnalyzer()
        sentiment_score = sid.polarity_scores(text)['compound']
        sentiment_tag = 'positive' if sentiment_score > 0 else ('negative' if sentiment_score < 0 else 'neutral')
        
        return sentiment_score, sentiment_tag

df['sentiment_score'], df['sentiment_tag'] = zip(*df['cleaned_tweets'].apply(analyze_sentiment))




In [9]:
df.head(3)

Unnamed: 0,fullname,Followers,retweets,tweets,timestamp,tweet_id,url,username,cleaned_tweets,sentiment_score,sentiment_tag
0,BilawalBhuttoZardari,5100000,4035,"For the first time in the history of Karachi, ...",2023-06-19 15:29:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari,for the first time in the history of karachi i...,0.7003,positive
1,BilawalBhuttoZardari,5100000,5577,"Thanks Swat! 🙏🏽 🙏🏽\nToday, we come together to...",2023-06-17 16:38:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari,thanks swat today we come together to maintain...,0.9666,positive
2,BilawalBhuttoZardari,5100000,4118,My Depest Sympathies With the Families of the ...,2023-06-17 11:09:00+00:00,1.67e+18,https://twitter.com/BBhuttoZardari/status/1670...,@BBhuttoZardari,my depest sympathies with the families of the ...,-0.872,negative


In [10]:
X = df["cleaned_tweets"]
y = df["sentiment_tag"]

In [11]:
# Assuming you have preprocessed data and labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
X_train

4795    heard with concern about hospitalisation of hi...
289     pakistan strongly condemns the insensitive and...
2589                                       i am on my way
4126    the details of reservations on the amendments ...
4494    preparing a cricket ground for the youngsters ...
                              ...                        
4426    saddened to learn of rahimullah yousafzais pas...
466     heartest felications to president xi on which ...
3092    from every citizen especially those who will e...
3772    after informing the president about the danger...
860     renala khurd ki awaam ka faisla puppet pm par ...
Name: cleaned_tweets, Length: 3880, dtype: object

In [15]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
joblib.dump(vectorizer, "tf-idf-vectorizer.pkl")
X_test_tfidf = vectorizer.transform(X_test)

In [16]:
# save the TF-IDF vectorizer


In [17]:
model = LogisticRegression(solver="lbfgs")
model.fit(X_train_tfidf, y_train)

In [18]:
# Make predictions
y_pred = model.predict(X_test_tfidf)

In [19]:
#  Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7971163748712667


In [21]:
import joblib
from sklearn.metrics import accuracy_score, classification_report


In [22]:
joblib.dump(model, "logistic_regression_model.pkl")

['logistic_regression_model.pkl']

In [23]:
#  Generate a classification report
class_report = classification_report(y_test, y_pred)

# Save the model and its results
saved_model = {
    "model": model,
    "vectorizer": vectorizer,
    "accuracy": accuracy,
    "classification_report": class_report
}



In [24]:
print(class_report)

              precision    recall  f1-score   support

    negative       0.78      0.76      0.77       335
     neutral       0.81      0.80      0.81       208
    positive       0.80      0.82      0.81       428

    accuracy                           0.80       971
   macro avg       0.80      0.79      0.80       971
weighted avg       0.80      0.80      0.80       971



In [25]:
# joblib.dump(saved_model, "logestics_regression_model.pkl")

In [26]:
# import joblib

# # Load the saved model
# loaded_model = joblib.load("logistic_regression_model.pkl")

# # Get the loaded model and vectorizer
# model = loaded_model["model"]
# vectorizer = loaded_model["vectorizer"]

# # Get user input
# user_input = input("Enter a tweet: ")

# # Preprocess the user input
# cleaned_input = clean_text(user_input)  # You need to define preprocess_text function

# # Vectorize the preprocessed input
# input_vector = vectorizer.transform([cleaned_input])

# # Make a prediction
# prediction = model.predict(input_vector)

# # Print the sentiment prediction
# print("Predicted Sentiment:", prediction[0])


TypeError: 'LogisticRegression' object is not subscriptable

In [31]:
# def map_to_sentiment_label(predicted_sentiment):
#     if predicted_sentiment > 0:  # Assuming 1 represents positive sentiment, adjust as needed
#         return "positive"
#     elif predicted_sentiment < 0:  # Assuming 0 represents neutral sentiment, adjust as needed
#         return "negative"
#     else:  # Assuming -1 represents negative sentiment, adjust as needed
#         return "neutral"

# # Example usage:
# # predicted_sentiment = 1  # Replace with the actual predicted sentiment value
# # predicted_sentiment_label = map_to_sentiment_label(predicted_sentiment)
# # print(f"Predicted Sentiment: {predicted_sentiment_label}")


In [33]:
# import joblib

# # Load the logistic regression model
# loaded_model = joblib.load("logistic_regression_model.pkl")

# # Get user input
# user_input = "Some text you want to classify"

# # Preprocess user input (use the same preprocessing used during training)
# preprocessed_input = clean_text(user_input)

# # Vectorize the preprocessed input using the same vectorizer used during training
# input_vector = vectorizer.transform([preprocessed_input])

# # Predict the sentiment using the loaded logistic regression model
# predicted_sentiment = loaded_model.predict(input_vector)

# # Map the predicted sentiment to the appropriate label (e.g., "positive", "negative")
# predicted_sentiment_label = map_to_sentiment_label(predicted_sentiment)

# # Print or use the predicted sentiment label as needed
# print(f"Predicted Sentiment: {predicted_sentiment_label}")


In [35]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create an SVM model
svm_model = SVC(kernel='linear')

# Train the SVM model
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
svm_predictions = svm_model.predict(X_test_tfidf)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)


SVM Accuracy: 0.8053553038105047


In [36]:
import joblib

# Save the SVM model to a .pkl file
model_filename = 'svm_model.pkl'
joblib.dump(svm_model, model_filename)


['svm_model.pkl']

In [37]:
# Load the SVM model from the .pkl file
loaded_model = joblib.load(model_filename)

# Now you can use loaded_model for predictions
svm_predictions = loaded_model.predict(X_test_tfidf)

In [38]:
svm_predictions

array(['negative', 'neutral', 'negative', 'negative', 'negative',
       'negative', 'negative', 'neutral', 'positive', 'negative',
       'negative', 'negative', 'neutral', 'negative', 'negative',
       'positive', 'negative', 'negative', 'positive', 'positive',
       'positive', 'positive', 'positive', 'negative', 'positive',
       'positive', 'positive', 'positive', 'neutral', 'negative',
       'neutral', 'neutral', 'positive', 'positive', 'positive',
       'negative', 'negative', 'neutral', 'neutral', 'negative',
       'neutral', 'neutral', 'positive', 'neutral', 'negative',
       'positive', 'negative', 'neutral', 'positive', 'neutral',
       'negative', 'negative', 'positive', 'positive', 'positive',
       'positive', 'positive', 'negative', 'positive', 'negative',
       'positive', 'neutral', 'positive', 'neutral', 'negative',
       'neutral', 'neutral', 'negative', 'negative', 'negative',
       'neutral', 'positive', 'negative', 'negative', 'neutral',
       'positi

Combining multiple models

In [25]:
# from sklearn.ensemble import VotingClassifier

# # Create a list of models
# models = [('logreg', logistic_regression_model), ('svm', svm_model), ('rf', random_forest_model)]

# # Create a VotingClassifier
# voting_classifier = VotingClassifier(estimators=models, voting='hard')

# # Train the VotingClassifier
# voting_classifier.fit(X_train_tfidf, y_train)

# # Make predictions using the VotingClassifier
# ensemble_predictions = voting_classifier.predict(X_test_tfidf)


In [47]:
# from sklearn.model_selection import GridSearchCV

# # Define a range of hyperparameters to search
# param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}

# # Create a grid search object
# grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)

# # Fit the grid search to your data
# grid_search.fit(X_train_tfidf, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# print("Best Hyperparameters:", best_params)

In [33]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import uniform

# # Define a distribution over hyperparameters
# param_dist = {'C': uniform(0.001, 100), 'penalty': ['l1', 'l2']}

# # Create a random search object
# random_search = RandomizedSearchCV(LogisticRegression(), param_distributions=param_dist, n_iter=100, cv=5)

# # Fit the random search to your data
# random_search.fit(X_train_tfidf, y_train)

# # Get the best hyperparameters
# best_params = random_search.best_params_
# print("Best Hyperparameters:", best_params)


In [26]:
# predict the output on the user input text
# user_input = input("enter text: ")
