### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import RegexpTokenizer
from nltk import PorterStemmer, WordNetLemmatizer
import pickle

In [3]:
# Importing dataset
data = pd.read_csv('cyberbullying_tweet.csv')

#### Encoding the *classification_type* values.

In [4]:
labelencoder = LabelEncoder()
data['cyberbullying_type_encoded'] = labelencoder.fit_transform(data['cyberbullying_type'])
data[['cyberbullying_type', 'cyberbullying_type_encoded']].value_counts()

cyberbullying_type  cyberbullying_type_encoded
cyberbullying       0                             37008
not_cyberbullying   1                             34972
dtype: int64

#### Preprocessing

In [5]:
# preprocessing functions

# converting tweet text to lower case
def text_lower(text):
    return text.str.lower()

# removing stopwoords from the tweet text
def clean_stopwords(text):
    # stopwords list that needs to be excluded from the data
    stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're',
             's', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']
    STOPWORDS = set(stopwordlist)
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

# cleaning and removing punctuations
def clean_puctuations(text):
    english_puctuations = string.punctuation
    translator = str.maketrans('','', english_puctuations)
    return text.translate(translator)

# cleaning and removing repeating characters
def clean_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# cleaning and removing URLs
def clean_URLs(text):
    return re.sub(r"((www.[^s]+)|(http\S+))","",text)

# cleaning and removing numeric data
def clean_numeric(text):
    return re.sub('[0-9]+', '', text)

# Tokenization of tweet text
def tokenize_tweet(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

# stemming    
def text_stemming(text):
    st = PorterStemmer()
    text = [st.stem(word) for word in text]
    return text

# lemmatization
def text_lemmatization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text

In [6]:
# defining preprocess function

def preprocess(text):
    text = text_lower(text)
    text = text.apply(lambda text: clean_stopwords(text))
    text = text.apply(lambda x : clean_puctuations(x))
    text = text.apply(lambda x: clean_repeating_characters(x))
    text = text.apply(lambda x : clean_URLs(x))
    text = text.apply(lambda x: clean_numeric(x))
    text = tokenize_tweet(text)
    text = text.apply(lambda x: text_stemming(x))
    text = text.apply(lambda x: text_lemmatization(x))
    text = text.apply(lambda x : " ".join(x))
    return text

data['tweet_text'] = preprocess(data['tweet_text'])
data

Unnamed: 0,tweet_text,cyberbullying_type,cyberbullying_type_encoded
0,rt colonelkickhead anoth bloodi instant restau...,not_cyberbullying,1
1,azzamalirhabi jihadia video peshmerga decim is...,not_cyberbullying,1
2,oh realli no instant restaur that shock mkr mkr,not_cyberbullying,1
3,rt benfrancisallen hasnt good week isi new fro...,not_cyberbullying,1
4,rt notofemin donâ t need femisnsn men carri he...,not_cyberbullying,1
...,...,...,...
71975,answer word word ur upset ur talk bridg build ...,cyberbullying,0
71976,imsoohaitian haitian fuck ugli nasti black fuc...,cyberbullying,0
71977,thareal no nigger idol wish white fuck outta h...,cyberbullying,0
71978,razbk heard got knock fuck lol dumb as nigger ...,cyberbullying,0


In [7]:
data.to_csv("cleaned_data.csv",index=False)

#### Model Creation

In [8]:
# Splitting the data into train and test
X, y = data['tweet_text'], data['cyberbullying_type_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 41)

In [10]:
# Transforming the data using TF-IDF Vectorizer
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features= 5000000)
vectoriser.fit(X_train)
# print("No. of feature words: ",len(vectoriser.get_feature_names()))

In [11]:
# Dumping the vectoriser
pickle.dump(vectoriser, open('tdf_vectorizer', 'wb'))

In [12]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)

In [13]:
# Model
svm_model_linear = SVC(kernel= 'linear', C = 1).fit(X_train, y_train)
svm_predictions  = svm_model_linear.predict(X_test)
accuracy = svm_model_linear.score(X_test, y_test)
print(accuracy)

0.8619987033435214


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
def models(X_train,Y_train):
  
  #Using RandomForestClassifier()
  rf=RandomForestClassifier()
  rf.fit(X_train,Y_train)
   #Using KNeighborsClassifier 
  knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
  knn.fit(X_train, Y_train)

  #Using DecisionTreeClassifier 
  tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
  tree.fit(X_train, Y_train)
    
  
  #print model accuracy on the training data.
  print('[1]K Nearest Neighbor Training Accuracy:', knn.score(X_train, Y_train))
  print('[2]Decision Tree Classifier Training Accuracy:', tree.score(X_train, Y_train))
  print('[3]Random Forest Classifier Training Accuracy:', rf.score(X_train, Y_train))
  
  
  
  return  rf,knn,tree

In [15]:
model=models(X_test, y_test)

[1]K Nearest Neighbor Training Accuracy: 0.9622117254792998
[2]Decision Tree Classifier Training Accuracy: 0.9723071223488006
[3]Random Forest Classifier Training Accuracy: 0.9723071223488006


In [16]:
# dumping the model
pickle.dump(svm_model_linear, open('model.bin', 'wb'))

In [17]:
#https://github.com/UB-Mannheim/tesseract/wiki

In [18]:
#pickle.dump(model[0], open('model.bin', 'wb'))

#### Predicting Custom input

In [19]:
# Function for custom input prediction
def custom_input_prediction(text):
    import nltk
    nltk.download('omw-1.4')
    text = pd.Series(text)
    text = preprocess(text)
    text = [text[0],]
    vectoriser = pickle.load(open("tdf_vectorizer", "rb"))
    text = vectoriser.transform(text)
    model = pickle.load(open("model.bin", "rb"))
    prediction = model.predict(text)
    prediction = prediction[0]

    interpretations = {
        0 : "cyberbullying",
        1 : "not_cyberbullying"
    }

    for i in interpretations.keys():
        if i == prediction:
            return interpretations[i]

In [20]:
something = "My Grandsons are angry about this gender free crap too! 2 in primary 2 @at high school T.he is 16 yr old ASD &amp; got bullied as did a girl in his SEN base. He had to step in as teachers to busy on phones playing games, wee lass would have had nowhere to run if loos unisex!"
something_2 = "But for u its Hinduphobia isnt it? When kashmiri pandits get killed, when a hindu girl gets raped by islamists, when radical islamic terrorism kill people in the world,u still keep quiet as if nothing is happening;but jump on when some1 says anything against islam!! #Hinduphobic"
new_something = "There was certainly a more ""acceptable"" time for them to be made though in the eyes of our world at large (which also includes other jokes like rape, gaybashing, etc.) Shit, try watching Friends or Seinfeld and watch how many times they throw gay people under the bus for a laugh."
print(custom_input_prediction(new_something))

cyberbullying


[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [21]:
!pip install opencv-python
!pip install pytesseract
!pip install tesseract


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
# text recognition
import cv2
import pytesseract

# read image
img = cv2.imread('sample_image.jpeg')

# configurations
con = ('-l eng --oem 1 --psm 3')

# pytessercat
pytesseract.pytesseract.tesseract_cmd = '/Users/jahnavipolasa/Downloads/cyberbullying-tweet-recognition-app-main/tesseract-5.3.0'
text = pytesseract.image_to_string(img, config=con)

# print text
#text = text.split('\n')
print(text)

PermissionError: [Errno 13] Permission denied: '/Users/jahnavipolasa/Downloads/cyberbullying-tweet-recognition-app-main/tesseract-5.3.0'

In [23]:
print(custom_input_prediction("stop justin bieber gay gener racist prison rape joke he given u plenti remark onlaugh but rape isnt it"))

cyberbullying


[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [24]:
import random
''.join(random.choice('0123456789ABCDEF') for i in range(9))


'4E0A8935B'