In [None]:
from google.colab import files
# Opens Colab's upload widget; the files chosen land in the runtime's working
# directory (presumably ANI_Dataset.csv and test_file.csv, which later cells read).
files.upload()

In [None]:
import pandas as pd
import re
# Load the labelled tweet dataset; later cells read its 'tweet_text' and
# 'tweet_labels' columns.
saved_df= pd.read_csv('ANI_Dataset.csv')

In [None]:
# Raw tweet texts, plus a 0/1 target derived from the truthiness of each
# entry in the 'tweet_labels' column.
data = saved_df['tweet_text']
labels = [1 if label else 0 for label in saved_df['tweet_labels']]

In [None]:
import requests
def download_file(url, timeout=60):
    """Stream the resource at ``url`` into a file in the current directory.

    The local file name is the text after the last ``/`` in ``url``. The
    response body is written in 8 KiB chunks so the whole payload is never
    held in memory at once.

    Args:
        url: Address of the file to download.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The name of the local file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    local_filename = url.split('/')[-1]
    # stream=True defers the body download until iter_content is consumed.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    return local_filename

In [None]:
def removePunctuations(sentence):
  """Return ``sentence`` with every character found in ``punctList`` removed.

  Only the characters listed in ``punctList`` are stripped; anything else
  (letters, digits, whitespace, and symbols such as ``<``, ``>``, ``+``,
  ``=`` or ``|``) is kept unchanged.

  Args:
    sentence: The string to filter.

  Returns:
    A new string made of the characters of ``sentence`` that are not in
    ``punctList``, in their original order.
  """
  # The backslash is escaped explicitly: the bare "\," form is an invalid
  # escape sequence that newer Python versions warn about.
  punctList = '''!()-[]{};:'"\\,./?@#$%^&@*_~'''
  # join() avoids rebuilding the result string once per kept character.
  return ''.join(character for character in sentence if character not in punctList)

In [None]:
def getCommonWords(threshold=900):
  """Return the words that occur more than ``threshold`` times in a reference text.

  Downloads a Project Gutenberg text with ``download_file``, lower-cases and
  strips each line, removes punctuation with ``removePunctuations``, splits
  on single spaces and counts the resulting tokens.

  Every occurrence is counted, including the first one (the earlier version
  stored 0 for a word's first appearance, so all counts were one too low).

  Blank lines contribute the empty string as a token, so ``''`` can appear in
  the result if it is frequent enough.

  Args:
    threshold: A word is returned only if its count is strictly greater
      than this value.

  Returns:
    A list of words ordered from most to least frequent.
  """
  from collections import Counter

  # Use the name download_file actually wrote instead of repeating it here.
  local_filename = download_file('https://www.gutenberg.org/files/766/766-0.txt')
  bagOfWords = Counter()
  # 'utf-8-sig' decodes UTF-8 with or without a leading byte-order mark, and
  # the context manager guarantees the file handle is closed.
  # NOTE(review): assumes the downloaded text is UTF-8 encoded - confirm.
  with open(local_filename, 'r', encoding='utf-8-sig') as f:
    for line in f:
      s = removePunctuations(line.lower().strip())
      bagOfWords.update(s.split(' '))
  # most_common() sorts by descending count, keeping first-seen order for ties.
  return [word for word, count in bagOfWords.most_common() if count > threshold]

In [None]:
# Build the frequent-word list once (this downloads the reference text);
# cleanSentence reads it as a global to filter words out of each tweet.
commonWords = getCommonWords()

In [None]:
def cleanSentence(sentence):
  """Normalise a raw tweet into a space-separated string of remaining words.

  Steps, in order: lower-case the text, delete anything starting with
  ``http`` up to the next whitespace, delete every character that is not an
  ASCII letter, digit or space, collapse runs of spaces into one, strip
  punctuation via ``removePunctuations``, and finally drop every word that
  appears in the global ``commonWords`` list.

  Args:
    sentence: The raw tweet text.

  Returns:
    The cleaned text with the surviving words joined by single spaces.
  """
  text = sentence.lower()
  text = re.sub(r'http\S+', '', text)
  text = re.sub(r'[^A-Za-z0-9 ]', '', text)
  text = re.sub(' +', ' ', text)
  text = removePunctuations(text)
  kept = []
  for word in text.split(' '):
    if word in commonWords:
      continue
    kept.append(word)
  return ' '.join(kept)

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

def tokenize_and_stem_data(newData):
  """Tokenize each sentence and reduce every token to its English Snowball stem.

  Args:
    newData: An iterable of sentence strings.

  Returns:
    A list with one string per input sentence: the stemmed tokens of that
    sentence joined by single spaces.
  """
  stemmer = SnowballStemmer('english')
  stemmedData = []
  for sentence in newData:
    stems = [stemmer.stem(token) for token in word_tokenize(sentence)]
    stemmedData.append(' '.join(stems))
  return stemmedData

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
# Module-level estimators shared by vectorize_and_pca (which fits them) and
# transform_to_vectorize_pca (which re-uses the fitted state).
vectorizer = TfidfVectorizer()
# random_state pins the randomized SVD solver that PCA may select
# automatically for large inputs, so re-running the notebook yields the same
# components and therefore the same downstream metrics.
pca = PCA(n_components = 100, random_state = 42)
def vectorize_and_pca(data):
  """Fit the shared TF-IDF vectorizer and PCA on ``data`` and return the reduced features.

  Both module-level estimators (``vectorizer`` and ``pca``) are re-fitted by
  this call, so it is meant for the training corpus.

  Args:
    data: An iterable of text documents.

  Returns:
    The PCA-transformed dense TF-IDF matrix, one row per document.
  """
  tfidf_dense = vectorizer.fit_transform(data).toarray()
  return pca.fit_transform(tfidf_dense)
def transform_to_vectorize_pca(data):
  """Project ``data`` with the already-fitted TF-IDF vectorizer and PCA.

  Unlike ``vectorize_and_pca`` this does not re-fit anything; it only applies
  the module-level ``vectorizer`` and ``pca`` as they currently are.

  Args:
    data: An iterable of text documents.

  Returns:
    The PCA-transformed dense TF-IDF matrix, one row per document.
  """
  tfidf_dense = vectorizer.transform(data).toarray()
  return pca.transform(tfidf_dense)

In [None]:
# Clean every tweet, then tokenize and stem the cleaned text.
cleanedData = [cleanSentence(sentence) for sentence in data]

tokenized_and_stemmed_data = tokenize_and_stem_data(cleanedData)

# Fit TF-IDF + PCA on the whole corpus and keep the reduced feature matrix.
# NOTE(review): the transforms are fitted before the train/test split that
# follows, so held-out rows influence the fitted features - confirm this is
# acceptable for the reported accuracy.
X = vectorize_and_pca(tokenized_and_stemmed_data)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train,X_test,y_train,y_test = train_test_split(X,labels,test_size=0.25,random_state=42)



In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings on the training split.
classifier =  LogisticRegression()
classifier.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Predict labels for the held-out split.
predicted=classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the held-out split, followed by accuracy computed as
# the two diagonal cells (correct predictions) over the number of test rows.
cm = confusion_matrix(y_test, predicted)
print(cm)
accuracy = (cm[0, 0] + cm[1, 1]) / len(y_test)
print(accuracy)

[[321  26]
 [ 94  81]]
0.7701149425287356


In [None]:
from joblib import dump
# Persist the fitted classifier, both fitted transformers and the common-word
# list to files in the working directory.
# Note: commonwords.pkl is written with joblib despite its .pkl extension, so
# it should be read back with joblib.load.
dump(classifier,'classifier.joblib')
dump(pca,'pca.joblib')
dump(vectorizer,'vectorizer.joblib')
dump(commonWords,'commonwords.pkl') 

['commonwords.pkl']

In [None]:
# Send each saved artifact to the browser as a download, in the same order
# they were written.
for artifact in ('classifier.joblib', 'pca.joblib', 'vectorizer.joblib', "commonwords.pkl"):
  files.download(artifact)

In [None]:
# A single raw tweet used to smoke-test the full pipeline end to end.
unknownsample = 'A 16-year-old girl from Salmara Bongaigaon, a secondary contact of a Markaz attendee, has tested #COVID19 positive. Number of #COVID19 patients now stands at 36: Assam Health Minister Himanta Biswa Sarma'

In [None]:
# Clean the sample and wrap it in a list, because the downstream helpers
# iterate over a collection of sentences; the bare name displays the result.
cleanedSample =  cleanSentence(unknownsample)
cleanedSample = [cleanedSample]
cleanedSample

['16yearold girl salmara bongaigaon secondary contact markaz attendee has tested covid19 positive number covid19 patients now stands 36 assam health minister himanta biswa sarma']

In [None]:
# Apply the same tokenize-and-stem step that was used on the training data.
tokenize_and_stem_sample = tokenize_and_stem_data(cleanedSample)

In [None]:
# Project the sample with the already-fitted TF-IDF/PCA (no re-fitting) and
# show the predicted label for this one tweet.
vectorized_pca_sample = transform_to_vectorize_pca(tokenize_and_stem_sample)
classifier.predict(vectorized_pca_sample)[0]

1

In [None]:
# Load the separate evaluation file and derive 0/1 labels from the
# truthiness of each entry in its 'tweet_labels' column.
df = pd.read_csv('test_file.csv')
new_data = df['tweet_text']
new_labels = [1 if label else 0 for label in df['tweet_labels']]

In [None]:
# Classify every tweet from the evaluation file with the fitted pipeline.
# The whole batch goes through cleaning, stemming, TF-IDF/PCA and the
# classifier in one pass instead of one tweet at a time.
# The iteration variable is deliberately not named `data`: that global holds
# the training tweets, and rebinding it here would break a re-run of the
# earlier cleaning cell.
cleaned_new_data = [cleanSentence(tweet) for tweet in new_data]
stemmed_new_data = tokenize_and_stem_data(cleaned_new_data)
new_features = transform_to_vectorize_pca(stemmed_new_data)
# Kept as a plain list (one predicted label per tweet), as before.
predictions = list(classifier.predict(new_features))

In [None]:
# Confusion matrix for the evaluation file, followed by accuracy computed as
# the two diagonal cells (correct predictions) over the number of predictions.
cm = confusion_matrix(new_labels, predictions)
print(cm)
accuracy = (cm[0, 0] + cm[1, 1]) / len(predictions)
print(accuracy)

[[225  15]
 [ 73  66]]
0.7678100263852242


In [None]:
# Put each tweet next to its actual and predicted label and save the table
# as a CSV file.
dataValue = [list(row) for row in zip(new_data, new_labels, predictions)]
newDf = pd.DataFrame(dataValue, columns=['tweet','actual_label','predicted_label'])
newDf.to_csv('Predicted_Datasets.csv')

In [None]:
# Download the per-tweet prediction table written by the previous cell.
files.download('Predicted_Datasets.csv')