In [1]:
import warnings
warnings.filterwarnings('ignore')
import time
import requests
import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
import numpy as np
import random
from sklearn.ensemble import VotingClassifier 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,cross_val_predict
from sklearn.svm import SVC,LinearSVC

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
from sklearn.metrics import classification_report, accuracy_score,precision_score,recall_score,roc_auc_score
import optuna
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

from IPython.display import display, Markdown,HTML

In [2]:
df=pd.read_csv('GfG (25).csv',encoding='ANSI')

In [3]:
df.shape

(4179, 3)

In [4]:
df.head()

Unnamed: 0,Sr No,Headings,Sentiment
0,1,Union Health Ministry to organize a week-long ...,0
1,2,PM Modi to unveil 108-feet statue of Lord Hanu...,0
2,3,Stage set for the final round of Hero 75th Nat...,-1
3,4,"""I would like to be a world record holder some...",0
4,5,Government exempts all customs duty on Cotton ...,0


In [5]:
df.Sentiment.value_counts()

 0    2374
 1    1322
-1     483
Name: Sentiment, dtype: int64

In [6]:
def cleanText(text:str)-> str:
    """
    This function takes a tweet and clean it by removing
    1. Hastags and Mentions
    2. URLS
    """
    text = text.lower()
    hash_pattern  = re.findall("[#|@]\w+",text)
    for p in hash_pattern:
        text = text.replace(p,"")
    urls = re.findall("http[a-z:/0-9.]+",text)
    for u in urls:
        text = text.replace(u,"")
    text =re.findall("\w+",text)
    text = " ".join(text)
    return text

In [7]:
df['Headings'] = df['Headings'].apply(cleanText)

In [8]:
df.head()

Unnamed: 0,Sr No,Headings,Sentiment
0,1,union health ministry to organize a week long ...,0
1,2,pm modi to unveil 108 feet statue of lord hanu...,0
2,3,stage set for the final round of hero 75th nat...,-1
3,4,i would like to be a world record holder somed...,0
4,5,government exempts all customs duty on cotton ...,0


In [9]:
X = df.Headings
y = df.Sentiment

In [10]:
vectorizer = TfidfVectorizer(min_df=0.001)
vectorizer.fit(X)
X_vec = vectorizer.transform(X)

In [11]:
X_vec

<4179x1652 sparse matrix of type '<class 'numpy.float64'>'
	with 39835 stored elements in Compressed Sparse Row format>

In [12]:
x_train,x_test,y_train,y_test = train_test_split(X_vec,y,test_size=0.2,random_state=50)

***Linear SVC***

In [13]:
lsvc = LinearSVC()
lsvc.fit(x_train,y_train)

LinearSVC()

In [14]:
train_pred = lsvc.predict(x_train)
test_pred = lsvc.predict(x_test)
print("Train : ")
print(classification_report(train_pred,y_train))
print("Test : ")
print(classification_report(test_pred,y_test))

Train : 
              precision    recall  f1-score   support

          -1       0.89      0.99      0.94       354
           0       1.00      0.95      0.97      1995
           1       0.94      0.99      0.96       994

    accuracy                           0.97      3343
   macro avg       0.94      0.98      0.96      3343
weighted avg       0.97      0.97      0.97      3343

Test : 
              precision    recall  f1-score   support

          -1       0.63      0.84      0.72        68
           0       0.97      0.85      0.91       530
           1       0.79      0.92      0.85       238

    accuracy                           0.87       836
   macro avg       0.80      0.87      0.83       836
weighted avg       0.89      0.87      0.88       836



In [15]:
def objectivel(trial,x_train,y_train):
    c = trial.suggest_float('C',0,100)
    model = LinearSVC(C=c)
    score = cross_val_score(model,x_train,y_train,n_jobs=-1,cv=5,scoring='f1_macro')
    return score.mean()

study1 = optuna.create_study(direction='maximize',study_name = 'text_classflinear')
func = lambda trial: objectivel(trial,x_train,y_train)
study1.optimize(func,n_trials=100,n_jobs=-1)

[32m[I 2022-04-27 11:55:36,495][0m A new study created in memory with name: text_classflinear[0m
[32m[I 2022-04-27 11:55:46,072][0m Trial 0 finished with value: 0.8172217837872557 and parameters: {'C': 1.1389405872348601}. Best is trial 0 with value: 0.8172217837872557.[0m
[32m[I 2022-04-27 11:55:46,865][0m Trial 2 finished with value: 0.7862719292506919 and parameters: {'C': 10.663980383282535}. Best is trial 0 with value: 0.8172217837872557.[0m
[32m[I 2022-04-27 11:55:47,521][0m Trial 1 finished with value: 0.7658880702222347 and parameters: {'C': 30.38003375071391}. Best is trial 0 with value: 0.8172217837872557.[0m
[32m[I 2022-04-27 11:55:47,765][0m Trial 3 finished with value: 0.7583791104303673 and parameters: {'C': 61.35190353027833}. Best is trial 0 with value: 0.8172217837872557.[0m
[32m[I 2022-04-27 11:55:47,877][0m Trial 4 finished with value: 0.7962072149992488 and parameters: {'C': 5.738257337324482}. Best is trial 0 with value: 0.8172217837872557.[0m
[3

In [16]:
study1.best_params

{'C': 1.0460413230593264}

In [17]:
lsvco = LinearSVC(C = 1.2735466278415457)
lsvco.fit(x_train,y_train)

LinearSVC(C=1.2735466278415457)

In [18]:
train_pred = lsvco.predict(x_train)
test_pred = lsvco.predict(x_test)
print("Train : ")
print(classification_report(train_pred,y_train))
print("Test : ")
print(classification_report(test_pred,y_test))

Train : 
              precision    recall  f1-score   support

          -1       0.91      0.99      0.95       361
           0       1.00      0.96      0.98      1981
           1       0.95      0.99      0.97      1001

    accuracy                           0.97      3343
   macro avg       0.95      0.98      0.96      3343
weighted avg       0.97      0.97      0.97      3343

Test : 
              precision    recall  f1-score   support

          -1       0.64      0.84      0.72        69
           0       0.97      0.85      0.91       530
           1       0.79      0.92      0.85       237

    accuracy                           0.87       836
   macro avg       0.80      0.87      0.83       836
weighted avg       0.89      0.87      0.87       836



In [19]:
def objective(trial,x_train,y_train):
    c = trial.suggest_float('C',0,100)
    k = trial.suggest_categorical('kernel',['linear', 'poly', 'rbf', 'sigmoid'])
    model = SVC(C=c,kernel=k)
    score = cross_val_score(model,x_train,y_train,n_jobs=-1,cv=5,scoring='f1_macro')
    return score.mean()

In [20]:
study2 = optuna.create_study(direction='maximize',study_name = 'text_classf')
func = lambda trial: objective(trial,x_train,y_train)
study2.optimize(func,n_trials=50,n_jobs=-1)

[32m[I 2022-04-27 11:56:24,159][0m A new study created in memory with name: text_classf[0m
[32m[I 2022-04-27 11:56:30,140][0m Trial 0 finished with value: 0.8020822335165392 and parameters: {'C': 5.393414807532615, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8020822335165392.[0m
[32m[I 2022-04-27 11:56:33,908][0m Trial 1 finished with value: 0.6950487251353816 and parameters: {'C': 62.168680923509356, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8020822335165392.[0m
[32m[I 2022-04-27 11:56:36,998][0m Trial 2 finished with value: 0.7655032778601263 and parameters: {'C': 13.331408513382403, 'kernel': 'linear'}. Best is trial 0 with value: 0.8020822335165392.[0m
[32m[I 2022-04-27 11:56:40,257][0m Trial 3 finished with value: 0.7505926072813519 and parameters: {'C': 53.494285918875576, 'kernel': 'linear'}. Best is trial 0 with value: 0.8020822335165392.[0m
[32m[I 2022-04-27 11:56:46,324][0m Trial 4 finished with value: 0.537357903764019 and parameters: {'C':

In [21]:
study2.best_params

{'C': 5.393414807532615, 'kernel': 'sigmoid'}

In [22]:
model = SVC(C=3.191749602515152, kernel = 'linear')
model.fit(x_train,y_train)

SVC(C=3.191749602515152, kernel='linear')

In [23]:
print('Train :')
print(classification_report(model.predict(x_train),y_train))
print("Test : ")
print(classification_report(model.predict(x_test),y_test))

Train :
              precision    recall  f1-score   support

          -1       0.91      0.98      0.94       365
           0       1.00      0.95      0.97      1997
           1       0.93      0.99      0.96       981

    accuracy                           0.96      3343
   macro avg       0.94      0.97      0.96      3343
weighted avg       0.97      0.96      0.96      3343

Test : 
              precision    recall  f1-score   support

          -1       0.68      0.78      0.73        79
           0       0.96      0.85      0.90       525
           1       0.77      0.92      0.84       232

    accuracy                           0.87       836
   macro avg       0.80      0.85      0.82       836
weighted avg       0.88      0.87      0.87       836



***RFC***

In [24]:
#random forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [27]:
print('Train :')
print(classification_report(rfc.predict(x_train),y_train))
print("Test : ")
print(classification_report(rfc.predict(x_test),y_test))

Train :
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       392
           0       1.00      1.00      1.00      1907
           1       1.00      1.00      1.00      1044

    accuracy                           1.00      3343
   macro avg       1.00      1.00      1.00      3343
weighted avg       1.00      1.00      1.00      3343

Test : 
              precision    recall  f1-score   support

          -1       0.65      0.91      0.76        65
           0       0.99      0.86      0.92       537
           1       0.80      0.94      0.86       234

    accuracy                           0.89       836
   macro avg       0.81      0.90      0.85       836
weighted avg       0.91      0.89      0.89       836



In [28]:
def objectiverfc(trial,x_train,y_train):
    n = trial.suggest_int('n_estimators',0,1000)
    c = trial.suggest_categorical('criterion',['gini','entropy'])
    md = trial.suggest_int('max_depth',1,100)
    model = RandomForestClassifier(n_estimators=n,criterion=c,max_depth=md)
    score = cross_val_score(model,x_train,y_train,n_jobs=-1,cv=5,scoring='f1_macro')
    return score.mean()

study3 = optuna.create_study(direction='maximize',study_name = 'text_classfrfc')
func = lambda trial: objectiverfc(trial,x_train,y_train)
study3.optimize(func,n_trials=100,n_jobs=-1)

[32m[I 2022-04-27 12:00:08,633][0m A new study created in memory with name: text_classfrfc[0m
[32m[I 2022-04-27 12:00:14,774][0m Trial 0 finished with value: 0.2704114539195024 and parameters: {'n_estimators': 420, 'criterion': 'entropy', 'max_depth': 4}. Best is trial 0 with value: 0.2704114539195024.[0m
[32m[I 2022-04-27 12:00:35,499][0m Trial 1 finished with value: 0.5788346852085332 and parameters: {'n_estimators': 905, 'criterion': 'entropy', 'max_depth': 19}. Best is trial 1 with value: 0.5788346852085332.[0m
[32m[I 2022-04-27 12:00:38,172][0m Trial 2 finished with value: 0.7483787350678742 and parameters: {'n_estimators': 451, 'criterion': 'gini', 'max_depth': 40}. Best is trial 2 with value: 0.7483787350678742.[0m
[32m[I 2022-04-27 12:01:00,423][0m Trial 4 finished with value: 0.7041987096936595 and parameters: {'n_estimators': 217, 'criterion': 'gini', 'max_depth': 31}. Best is trial 2 with value: 0.7483787350678742.[0m
[32m[I 2022-04-27 12:01:02,540][0m Trial

In [29]:
study3.best_params

{'n_estimators': 807, 'criterion': 'gini', 'max_depth': 99}

In [30]:
rfc = RandomForestClassifier(n_estimators= 391, criterion='gini', max_depth=100)
rfc.fit(x_train,y_train)

RandomForestClassifier(max_depth=100, n_estimators=391)

In [31]:
print('Train :')
print(classification_report(rfc.predict(x_train),y_train))
print("Test : ")
print(classification_report(rfc.predict(x_test),y_test))

Train :
              precision    recall  f1-score   support

          -1       0.82      1.00      0.90       322
           0       1.00      0.91      0.95      2091
           1       0.89      1.00      0.94       930

    accuracy                           0.94      3343
   macro avg       0.90      0.97      0.93      3343
weighted avg       0.95      0.94      0.95      3343

Test : 
              precision    recall  f1-score   support

          -1       0.58      0.90      0.71        59
           0       0.99      0.83      0.90       556
           1       0.76      0.95      0.84       221

    accuracy                           0.87       836
   macro avg       0.78      0.89      0.82       836
weighted avg       0.90      0.87      0.87       836



In [32]:
vc = vectorizer.transform(['Generative adversarial networks (GANs) have recently found applications in image editing. However, most GAN based image editing methods often require large scale datasets with semantic segmentation annotations for training, only provide high level control, or merely interpolate between different images. Here, we propose EditGAN, a novel method for high quality, high precision semantic image editing, allowing users to edit images by modifying their highly detailed part segmentation masks, e.g., drawing a new mask for the headlight of a car. EditGAN builds on a GAN framework that jointly models images and their semantic segmentations, requiring only a handful of labeled examples, making it a scalable tool for editing. Specifically, we embed an image into the GAN latent space and perform conditional latent code optimization according to the segmentation edit, which effectively also modifies the image. To amortize optimization, we find editing vectors in latent space that realize the edits. The framework allows us to learn an arbitrary number of editing vectors, which can then be directly applied on other images at interactive rates. We experimentally show that EditGAN can manipulate images with an unprecedented level of detail and freedom, while preserving full image quality.We can also easily combine multiple edits and perform plausible edits beyond EditGAN training data. We demonstrate EditGAN on a wide variety of image types and quantitatively outperform several previous editing methods on standard editing benchmark tasks'])
print(rfc.predict(vc))
print(lsvco.predict(vc))
print(model.predict(vc))

[0]
[1]
[1]


***Voting Classifier***

In [33]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=50)

In [34]:
lsvc_pipeline = make_pipeline(TfidfVectorizer(min_df=0.001),LinearSVC(C =0.37694603570267127))
lsvc_pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=0.001)),
                ('linearsvc', LinearSVC(C=0.37694603570267127))])

In [35]:
svc_pipeline = make_pipeline(TfidfVectorizer(min_df=0.001),SVC(C=0.6451288708678313, kernel='linear',probability=True))
svc_pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=0.001)),
                ('svc',
                 SVC(C=0.6451288708678313, kernel='linear', probability=True))])

***Multinomial NB***

In [36]:
mnb_pipeline = make_pipeline(TfidfVectorizer(min_df=0.001), MultinomialNB(alpha=0.05, fit_prior=True))
mnb_pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=0.001)),
                ('multinomialnb', MultinomialNB(alpha=0.05))])

In [37]:
print(classification_report(mnb_pipeline.predict(x_train),y_train))
print(classification_report(mnb_pipeline.predict(x_test),y_test))

              precision    recall  f1-score   support

          -1       0.79      0.91      0.84       337
           0       0.96      0.92      0.94      1993
           1       0.88      0.91      0.89      1013

    accuracy                           0.91      3343
   macro avg       0.87      0.91      0.89      3343
weighted avg       0.92      0.91      0.91      3343

              precision    recall  f1-score   support

          -1       0.40      0.73      0.51        49
           0       0.89      0.80      0.85       520
           1       0.72      0.75      0.73       267

    accuracy                           0.78       836
   macro avg       0.67      0.76      0.70       836
weighted avg       0.81      0.78      0.79       836



In [38]:
rfc_pipeline = make_pipeline(TfidfVectorizer(min_df=0.001),RandomForestClassifier(n_estimators=695, criterion='entropy', max_depth=46))
rfc_pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=0.001)),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='entropy', max_depth=46,
                                        n_estimators=695))])

In [39]:
estimator = []
# estimator.append(('LSVC', lsvc_pipeline))
estimator.append(('SVC', svc_pipeline))
estimator.append(('RFC', rfc_pipeline))
estimator.append(('MNB',mnb_pipeline))

vot_soft = VotingClassifier(estimators=estimator, voting='soft')
vot_soft.fit(x_train,y_train)


VotingClassifier(estimators=[('SVC',
                              Pipeline(steps=[('tfidfvectorizer',
                                               TfidfVectorizer(min_df=0.001)),
                                              ('svc',
                                               SVC(C=0.6451288708678313,
                                                   kernel='linear',
                                                   probability=True))])),
                             ('RFC',
                              Pipeline(steps=[('tfidfvectorizer',
                                               TfidfVectorizer(min_df=0.001)),
                                              ('randomforestclassifier',
                                               RandomForestClassifier(criterion='entropy',
                                                                      max_depth=46,
                                                                      n_estimators=695))])),
                          

In [40]:
train_pred = vot_soft.predict(x_train)
print(classification_report(train_pred,y_train))

test_pred = vot_soft.predict(x_test)
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

          -1       0.80      0.95      0.87       327
           0       0.99      0.92      0.95      2061
           1       0.90      0.98      0.94       955

    accuracy                           0.94      3343
   macro avg       0.90      0.95      0.92      3343
weighted avg       0.95      0.94      0.94      3343

              precision    recall  f1-score   support

          -1       0.60      0.90      0.72        61
           0       0.97      0.82      0.89       548
           1       0.75      0.92      0.83       227

    accuracy                           0.86       836
   macro avg       0.77      0.88      0.81       836
weighted avg       0.88      0.86      0.86       836



In [41]:
voting_cvs = cross_val_score(vot_soft,x_train,y_train,cv=5,verbose=2,n_jobs=-1).mean()
voting_cvs

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   46.3s finished


0.8528248435863699

***XGBOOST***

In [42]:
from xgboost import XGBClassifier

In [43]:
xgb = make_pipeline(TfidfVectorizer(min_df=0.001), XGBClassifier())
xgb.fit(x_train,y_train)



Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(min_df=0.001)),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, enable_categorical=False,
                               gamma=0, gpu_id=-1, importance_type=None,
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=4, num_parallel_tree=1,
                               objective='multi:softprob', predictor='auto',
                               random_state=0, reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=None, subsample=1,
                               tree_method=

In [44]:
print(classification_report(xgb.predict(x_train),y_train))
print(classification_report(xgb.predict(x_test),y_test))

              precision    recall  f1-score   support

          -1       0.84      1.00      0.91       329
           0       1.00      0.92      0.96      2063
           1       0.90      0.99      0.94       951

    accuracy                           0.95      3343
   macro avg       0.91      0.97      0.94      3343
weighted avg       0.96      0.95      0.95      3343

              precision    recall  f1-score   support

          -1       0.62      0.86      0.72        65
           0       0.98      0.85      0.91       543
           1       0.78      0.94      0.85       228

    accuracy                           0.87       836
   macro avg       0.79      0.88      0.83       836
weighted avg       0.90      0.87      0.88       836



In [45]:
cvs = cross_val_predict(xgb,x_train,y_train,cv=5,verbose=2,n_jobs=-1).mean()
cvs

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.6min finished


0.17170206401435836

***PCA***

In [46]:
from sklearn.decomposition import PCA

In [47]:
X_vec

<4179x1652 sparse matrix of type '<class 'numpy.float64'>'
	with 39835 stored elements in Compressed Sparse Row format>

In [48]:
pca = PCA()
pca.fit(X_vec.A)

PCA()

In [49]:
X_pca = pca.transform(X_vec.A)

In [50]:
x_train,x_test,y_train,y_test = train_test_split(X_pca,y,test_size = 0.2)

In [51]:
linear_svc = LinearSVC()
linear_svc.fit(x_train, y_train)

LinearSVC()

In [52]:
train_pred = linear_svc.predict(x_train)
test_pred = linear_svc.predict(x_test)

In [53]:
print(classification_report(train_pred,y_train))
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

          -1       0.89      0.98      0.93       346
           0       0.99      0.96      0.98      1964
           1       0.95      0.98      0.97      1033

    accuracy                           0.97      3343
   macro avg       0.94      0.97      0.96      3343
weighted avg       0.97      0.97      0.97      3343

              precision    recall  f1-score   support

          -1       0.61      0.88      0.72        69
           0       0.96      0.84      0.90       550
           1       0.76      0.89      0.82       217

    accuracy                           0.86       836
   macro avg       0.78      0.87      0.81       836
weighted avg       0.88      0.86      0.86       836



***comparing models***

In [54]:
estimator = []
# estimator.append(('LSVC', lsvc_pipeline))
estimator.append(('SVC', svc_pipeline))
estimator.append(('RFC', rfc_pipeline))
estimator.append(('MNB',mnb_pipeline))

vot_soft = VotingClassifier(estimators=estimator, voting='soft')
vot_soft.fit(x_train,y_train)


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
#models##
#vot_soft
#mnb_pipeline
#rfc_pipeline
#svc_pipeline
#linear_svc
#xgb
#rfc
#lsvco
#model

In [None]:
str = [input("enter string to predict : ")]

a = vot_soft.predict(str)

dic = {-1:'negative',1:'positive',0:'nutral'}

def find_flag(x):
    if x in dic:
        return dic[x]
print("\nCategory : ",find_flag(a[0]))

In [None]:
model.predict(['may be today war ends'])

***saving model***

In [None]:
import pickle

In [None]:
filename = 'news_sentiment_analysis.sav'
pickle.dump(mnb_pipeline, open(filename, 'wb'))