In [None]:
pip freeze > requirements.txt

In [None]:
pip install matplotlib

In [None]:
pip install seaborn

In [None]:
pip list

In [1]:
import re
import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# Main dataset; later cells read its 'HS' and 'Abusive' columns.
data = pd.read_csv('data.csv', encoding='latin-1')

# The slang dictionary is read without a header row, so columns 0 and 1 are
# labelled by position right after loading.
alay_dict = (
    pd.read_csv('new_kamusalay.csv', encoding='latin-1', header=None)
    .rename(columns={0: 'original', 1: 'replacement'})
)

# Abusive-word list; later cells read its 'ABUSIVE' column.
abusive_dict = pd.read_csv('abusive.csv', encoding='latin-1')

In [None]:
from flask import request, Flask, jsonify
from flasgger import Swagger, LazyString, LazyJSONEncoder, swag_from

# Flask application object; the route decorators further down register on it.
app = Flask(__name__)

# Swap in flasgger's LazyJSONEncoder as the app's JSON encoder; presumably this
# is what lets the LazyString values in the template below be serialised.
# NOTE(review): Flask 2.3 removed `app.json_encoder` — confirm the installed
# Flask version still supports this assignment.
app.json_encoder = LazyJSONEncoder

# Swagger template: static API metadata plus a `host` value that is evaluated
# lazily from `request.host` instead of being fixed at definition time.
swagger_template = dict(
info = {
    'title': LazyString(lambda: 'API Documentation for Data Processing and Modeling'),
    'version': LazyString(lambda: '1.0.0'),
    'description': LazyString(lambda: 'Dokumentasi API untuk Data Processing and Modeling')
    },
    host = LazyString(lambda: request.host)
)

# Swagger routing: the spec is exposed at /docs.json, the UI at /docs/, and the
# UI's static assets under /flasgger_static.
swagger_config = {
    "headers": [],
    "specs": [
        {
            "endpoint": 'docs',
            "route": '/docs.json'
        }
    ],
    "static_url_path": "/flasgger_static",
    "swagger_ui": True,
    "specs_route": "/docs/"
}

# Attach Swagger to the app using the template and config defined above.
swagger = Swagger(app, template=swagger_template,
                 config=swagger_config)

# ENDPOINT: BASIC GET RETURNING A FIXED GREETING
# NOTE(review): the spec path below is an absolute path on one machine; it has
# to exist on whatever host runs this notebook.
@swag_from("C:/Users/nde/Downloads/Binar Gold Challenge/docs/hello_world.yml", methods=['GET'])
@app.route('/', methods=['GET'])
def hello_world():
    """Handle ``GET /`` by returning a constant greeting as a JSON response.

    Returns:
        The result of ``jsonify`` on a mapping with the keys ``status_code``,
        ``description`` and ``data``.
    """
    payload = dict(
        status_code=200,
        description="Menyapa Hello World",
        data="Hello World",
    )
    return jsonify(payload)

# DEFINE ENDPOINTS: POST FOR TEXT PROCESSING FROM TEXT INPUT
# NOTE(review): the spec path below is an absolute path on one machine; it has
# to exist on whatever host runs this notebook.
@swag_from("C:/Users/nde/Downloads/Binar Gold Challenge/docs/text_processing.yml", methods=['POST'])
@app.route('/text-processing', methods=['POST'])
def text_processing():
    """Handle ``POST /text-processing``: clean the ``text`` form field and return it as JSON.

    Cleaning steps, in order:
      1. newlines become spaces;
      2. URLs (``www.``, ``http://``, ``https://``) are removed;
      3. the stand-alone tokens ``rt``/``RT`` and ``user``/``USER`` are removed;
      4. every run of non-alphanumeric characters becomes one space;
      5. tokens listed in ``abusive_dict.ABUSIVE`` are dropped;
      6. remaining tokens found in ``alay_dict['original']`` are replaced by the
         matching ``alay_dict['replacement']`` value.

    Returns:
        A JSON response with ``status_code``, ``description`` and ``data`` (the
        cleaned text, single-spaced and stripped). When the ``text`` form field
        is missing, a JSON error payload is returned with HTTP status 400.
    """
    # The cleaned text is still published as a module-level name, as before.
    global text

    raw_text = request.form.get('text')
    if raw_text is None:
        # Previously a missing field reached re.sub(None) and raised TypeError.
        error_response = {
            'status_code': 400,
            'description': "Form field 'text' is required",
            'data': None
        }
        return jsonify(error_response), 400

    text = re.sub(r'\n', ' ', raw_text)
    # Strip URLs before token removal so a URL is never split into fragments first.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    # Word boundaries keep 'rt' / 'user' inside ordinary words intact
    # (e.g. "partai" is no longer turned into "pa ai").
    text = re.sub(r'\b(?:rt|RT)\b', ' ', text)
    text = re.sub(r'\b(?:user|USER)\b', ' ', text)
    text = re.sub(r'[^0-9a-zA-Z]+', ' ', text)

    abusive_words = set(abusive_dict.ABUSIVE.values)
    alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))

    # Drop abusive tokens first, then normalise the remaining ones; split()
    # without an argument also discards the empty tokens left by extra spaces.
    cleaned_tokens = [
        alay_dict_map.get(word, word)
        for word in text.split()
        if word not in abusive_words
    ]
    text = ' '.join(cleaned_tokens)

    # The original returned the joined string directly at this point, which made
    # the JSON response below unreachable.
    json_response = {
        'status_code': 200,
        'description': "Teks yang baru saja diinput",
        'data': text
    }

    response_data = jsonify(json_response)
    return response_data
       
# DEFINE ENDPOINTS: POST FOR TEXT PROCESSING FROM FILE
# NOTE(review): the spec path below is an absolute path on one machine; it has
# to exist on whatever host runs this notebook.
@swag_from("C:/Users/nde/Downloads/Binar Gold Challenge/docs/file_processing.yml", methods=['POST'])
@app.route('/text-processing-file', methods=['POST'])
def text_processing_file():
    """Handle ``POST /text-processing-file``: clean an uploaded CSV of tweets.

    The upload named ``file`` is read as a latin-1 CSV and reduced to its
    ``Tweet`` column. Missing and duplicated tweets are dropped, then these
    columns are derived:

      * ``no_char`` / ``no_words``      - length and word count of the raw tweet
      * ``cleaned_tweet``               - tweet with everything except ASCII
                                          letters, digits and spaces removed,
                                          then stripped
      * ``no_char_2`` / ``no_words_2``  - length and word count of the cleaned tweet
      * ``estimated_no_abs_words``      - number of tokens of the cleaned tweet
                                          (lower-cased) found in
                                          ``abusive_dict['ABUSIVE']``

    All columns except ``estimated_no_abs_words`` are inserted into the
    ``post_df`` table of ``database_project.db``; the table is created when it
    does not exist yet. The resulting frame is also stored in the module-level
    ``post_df`` so it can be inspected or plotted outside this handler.

    Returns:
        A JSON response with ``status_code``, ``description`` and ``data`` (the
        list of cleaned tweets).

    Raises:
        KeyError: if the upload has no ``Tweet`` column (raised by pandas).
    """
    global post_df

    # USING REQUEST TO GET FILE THAT HAS BEEN POSTED FROM API ENDPOINT
    file = request.files['file']
    uploaded_df = pd.read_csv(file, encoding='latin-1')

    # KEEP THE TWEET COLUMN ONLY, WITHOUT MISSING OR DUPLICATED TWEETS.
    # .copy() makes this an independent frame, so the column assignments below
    # do not write into a slice of the uploaded frame.
    tweets_df = (
        uploaded_df[['Tweet']]
        .dropna(subset=['Tweet'])
        .drop_duplicates()
        .copy()
    )

    non_alphanumeric = re.compile(r'[^a-zA-Z0-9 ]')

    def tweet_cleansing(x):
        """Remove every character that is not an ASCII letter, digit or space, then strip."""
        return non_alphanumeric.sub('', x).strip()

    # word -> number of rows it occupies in the abusive list. Summing these per
    # token yields the same count as comparing every list row with every token,
    # without rescanning the whole list for each tweet.
    abusive_counts = abusive_dict['ABUSIVE'].value_counts().to_dict()

    def count_abusive(x):
        """Count the tokens of ``x`` that match an abusive word (case-insensitive on the token)."""
        return sum(abusive_counts.get(token.lower(), 0) for token in x.split())

    # STATISTICS OF THE RAW TWEET
    tweets_df['no_char'] = tweets_df['Tweet'].apply(len)
    tweets_df['no_words'] = tweets_df['Tweet'].apply(lambda x: len(x.split()))

    # CLEANED TWEET AND ITS STATISTICS
    tweets_df['cleaned_tweet'] = tweets_df['Tweet'].apply(tweet_cleansing)
    tweets_df['no_char_2'] = tweets_df['cleaned_tweet'].apply(len)
    tweets_df['no_words_2'] = tweets_df['cleaned_tweet'].apply(lambda x: len(x.split()))
    tweets_df['estimated_no_abs_words'] = tweets_df['cleaned_tweet'].apply(count_abusive)

    # sqlite3 is handed plain Python ints, hence the explicit int() casts.
    stored_columns = ['Tweet', 'no_char', 'no_words', 'cleaned_tweet', 'no_char_2', 'no_words_2']
    rows = [
        (tweet, int(no_char), int(no_words), cleaned_tweet, int(no_char_2), int(no_words_2))
        for tweet, no_char, no_words, cleaned_tweet, no_char_2, no_words_2
        in tweets_df[stored_columns].itertuples(index=False, name=None)
    ]

    q_creation = (
        "create table if not exists post_df "
        "(Tweet text, no_char int, no_words int, cleaned_tweet text, no_char_2 int, no_words_2 int)"
    )
    q_insertion = "insert into post_df (Tweet, no_char, no_words, cleaned_tweet, no_char_2, no_words_2) values (?,?,?,?,?,?)"

    # CONNECT / CREATE DATABASE, MAKE SURE THE TABLE EXISTS, INSERT ALL ROWS
    conn = sqlite3.connect('database_project.db')
    try:
        conn.execute(q_creation)
        conn.executemany(q_insertion, rows)
        # One commit for the whole upload instead of one per row.
        conn.commit()
    finally:
        # Close the connection even when an insert fails.
        conn.close()

    post_df = tweets_df

    # OUTPUT THE RESULT IN JSON FORMAT
    json_response = {
        'status_code': 200,
        'description': "Teks yang sudah diproses",
        'data': list(post_df['cleaned_tweet'])
    }

    # The plotting statements that used to follow the return were unreachable
    # and have been removed from the handler; the frame stays available through
    # the module-level `post_df`.
    response_data = jsonify(json_response)
    return response_data
    
    
# Start the Flask server with default settings when this code runs as the main
# module. The recorded output shows it serving on http://127.0.0.1:5000 until
# interrupted with CTRL+C.
if __name__ == "__main__":
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [28/Sep/2023 19:54:14] "GET /docs/ HTTP/1.1" 200 -
127.0.0.1 - - [28/Sep/2023 19:54:16] "GET /flasgger_static/swagger-ui-bundle.js HTTP/1.1" 304 -
127.0.0.1 - - [28/Sep/2023 19:54:16] "GET /flasgger_static/swagger-ui-standalone-preset.js HTTP/1.1" 304 -
127.0.0.1 - - [28/Sep/2023 19:54:16] "GET /flasgger_static/swagger-ui.css HTTP/1.1" 304 -
127.0.0.1 - - [28/Sep/2023 19:54:16] "GET /flasgger_static/lib/jquery.min.js HTTP/1.1" 304 -
127.0.0.1 - - [28/Sep/2023 19:54:17] "GET /docs.json HTTP/1.1" 200 -
127.0.0.1 - - [28/Sep/2023 20:05:39] "POST /text-processing-file HTTP/1.1" 200 -
127.0.0.1 - - [28/Sep/2023 20:10:07] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [28/Sep/2023 20:10:27] "POST /text-processing HTTP/1.1" 200 -


In [None]:
def lowercase(text):
    """Return ``text`` converted to lower case via ``text.lower()``."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Strip tweet boilerplate (newlines, URLs, retweet and username markers) from ``text``.

    Steps, in order:
      1. newlines become spaces;
      2. URLs starting with ``www.``, ``http://`` or ``https://`` are removed;
      3. the stand-alone tokens ``rt``/``RT`` (retweet marker) and
         ``user``/``USER`` (username placeholder) are removed;
      4. runs of two or more spaces collapse to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading and trailing spaces are not stripped.
    """
    text = re.sub(r'\n', ' ', text)  # Remove every '\n'
    # URLs go first so that token removal never splits a URL into fragments.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)  # Remove every URL
    # Word boundaries restrict the match to whole tokens, so 'rt' or 'user'
    # inside an ordinary word (e.g. "partai") is left untouched.
    text = re.sub(r'\b(?:rt|RT)\b', ' ', text)  # Remove every retweet symbol
    text = re.sub(r'\b(?:user|USER)\b', ' ', text)  # Remove every username
    text = re.sub(r'  +', ' ', text)  # Remove extra spaces
    return text
    
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside ``0-9``, ``a-z``, ``A-Z`` with one space."""
    non_alphanumeric_run = re.compile('[^0-9a-zA-Z]+')
    return non_alphanumeric_run.sub(' ', text)

# Lookup table from a slang spelling to its replacement, built once from alay_dict.
alay_dict_map = dict(zip(alay_dict['original'], alay_dict['replacement']))


def normalize_alay(text):
    """Replace every space-separated token of ``text`` that is a key of ``alay_dict_map``.

    Tokens without an entry are kept as they are; the tokens are re-joined with
    single spaces.
    """
    normalized_tokens = []
    for token in text.split(' '):
        normalized_tokens.append(alay_dict_map.get(token, token))
    return ' '.join(normalized_tokens)

def remove_abusive(text):
    """Blank out tokens of ``text`` listed in ``abusive_dict.ABUSIVE`` and tidy the spacing.

    Matching is done on the space-separated tokens exactly as written. The
    result has runs of spaces collapsed and is stripped at both ends.
    """
    abusive_words = abusive_dict.ABUSIVE.values
    kept_tokens = []
    for token in text.split(' '):
        kept_tokens.append('' if token in abusive_words else token)
    joined = ' '.join(kept_tokens)
    # Remove the extra spaces left behind by the blanked-out tokens.
    return re.sub('  +', ' ', joined).strip()

# Quick manual check: run each helper on a small sample and print the result.
# Punctuation runs are replaced by single spaces.
print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("Halooo,,,,, duniaa!!"))
# The whole string is lower-cased.
print("lowercase: ", lowercase("Halooo, duniaa!"))
# The sample contains newlines, an RT marker, USER tokens and a www. URL.
print("remove_unnecessary_char: ", remove_unnecessary_char("Hehe\n\n RT USER USER apa kabs www.google.com\n  hehe"))
# Tokens are looked up in the slang dictionary loaded earlier.
print("normalize_alay: ", normalize_alay("aamiin adek abis"))
# Tokens are checked against the abusive-word list loaded earlier.
print("remove_abusive: ", remove_abusive("anak anjing"))

In [None]:
# VISUALIZE THE NUMBER OF ABUSIVE WORDS USING BARPLOT (COUNTPLOT)
# Reads the module-level `post_df`, so it only works after a file has been
# processed by the /text-processing-file endpoint in this kernel.
# The statements are at top level: the original indentation under a column-0
# comment made this cell raise IndentationError.

# Install the filter before plotting so it is active for the seaborn calls.
warnings.filterwarnings('ignore', category=FutureWarning)

count_fig, count_ax = plt.subplots(figsize=(10, 7))
countplot = sns.countplot(data=post_df, x="estimated_no_abs_words", ax=count_ax)
for p in countplot.patches:
    # Write each bar's height just above the bar, centred on it.
    countplot.annotate(
        format(p.get_height(), '.0f'),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', xytext=(0, 10), textcoords='offset points',
    )
count_ax.set_title('Count of Estimated Number of Abusive Words')
count_ax.set_xlabel('Estimated Number of Abusive Words')
count_fig.savefig('new_countplot.jpeg')

# VISUALIZE THE NUMBER OF WORDS USING BOXPLOT
box_fig, box_ax = plt.subplots(figsize=(20, 4))
boxplot = sns.boxplot(data=post_df, x="no_words_2", ax=box_ax)
box_ax.set_title('Number of Words Boxplot (after tweet cleansing)')
box_ax.set_xlabel('')
box_fig.savefig('new_boxplot.jpeg')

In [None]:
# Find out how many utterances are toxic and how many are not.
# Toxic: HS == 1 or Abusive == 1. Non-toxic: HS == 0 and Abusive == 0.
print("Toxic shape: ", data.loc[data['HS'].eq(1) | data['Abusive'].eq(1)].shape)
print("Non-toxic shape: ", data.loc[data['HS'].eq(0) & data['Abusive'].eq(0)].shape)