<a href="https://colab.research.google.com/github/ssrbsoni/Tweet-Sentiments/blob/main/tutorials/streamlit_notebooks/SENTIMENT_EN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Find sentiment in text**

## 1. Colab Setup

In [11]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash
# !bash colab.sh
# -p is for pyspark
# -s is for spark-nlp
# !bash colab.sh -p 3.1.1 -s 3.0.1
# by default they are set to the latest

--2021-11-16 17:49:03--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-11-16 17:49:04--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-11-16 17:49:04--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [85]:
! pip install unidecode wordninja 

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 7.4 MB/s 
[?25hCollecting wordninja
  Downloading wordninja-2.0.0.tar.gz (541 kB)
[K     |████████████████████████████████| 541 kB 34.5 MB/s 
[?25hBuilding wheels for collected packages: wordninja
  Building wheel for wordninja (setup.py) ... [?25l[?25hdone
  Created wheel for wordninja: filename=wordninja-2.0.0-py3-none-any.whl size=541551 sha256=e796e7de57e9abe3279ee7639c6733d3dd68af5469b891098dbe70f3ab3240ca
  Stored in directory: /root/.cache/pip/wheels/dd/3f/eb/a2692e3d2b9deb1487b09ba4967dd6920bd5032bfd9ff7acfc
Successfully built wordninja
Installing collected packages: wordninja, unidecode
Successfully installed unidecode-1.3.2 wordninja-2.0.0


In [96]:
import pandas as pd
import numpy as np
import re
import unidecode
import wordninja
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

## 2. Start Spark Session

In [13]:
# Session object used below to build the Spark DataFrames (spark.createDataFrame).
spark = sparknlp.start()

In [106]:
# Stage 1: turn the raw "text" column into a "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained English "tfhub_use" sentence encoder; reads "document",
# writes "embeddings". The first call downloads the model (see the log below).
encoder = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("embeddings")
)

# Stage 3: trainable classifier on top of the embeddings. It learns from the
# "label" column and writes its output to "prediction".
clf = (
    ClassifierDLApproach()
    .setInputCols(["embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("label")
    .setMaxEpochs(30)
    .setBatchSize(32)
)

# Chain the three stages, in order, into one Spark ML pipeline.
pipeline = Pipeline(stages=[document_assembler, encoder, clf])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [104]:
# CSV locations, resolved relative to the current working directory.
trainPath, testPath = 'train_twit.csv', 'test_twit.csv'

# Load both splits into pandas; cleaning happens there before the move to Spark.
train_df = pd.read_csv(filepath_or_buffer=trainPath)
test_df = pd.read_csv(filepath_or_buffer=testPath)

In [82]:
def clean_tweet(text):
    """Normalise a raw tweet into lower-case, space-separated words.

    Steps, in order:
      1. lower-case the text;
      2. drop Twitter handles (``@name``), ``http...`` links and
         ``pic.<host>/...`` links;
      3. transliterate non-ASCII characters with ``unidecode``;
      4. replace everything except letters, ``+`` and apostrophes with spaces;
      5. split run-together words with ``wordninja``
         (e.g. 'whatisthis' -> 'what is this');
      6. keep only words longer than one character, joined by single spaces.

    Parameters
    ----------
    text : str
        The raw tweet. Any non-string value (``None``, or the float ``NaN``
        pandas uses for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned tweet; ``''`` when nothing survives the cleaning.
    """
    # Missing values would otherwise crash on .lower() below.
    if not isinstance(text, str):
        return ''

    # lower-case all characters
    text = text.lower()

    # remove twitter handles
    text = re.sub(r'@\S+', '', text)

    # remove urls
    text = re.sub(r'http\S+', '', text)
    # Only strip tokens that really start with "pic." (pic.twitter.com/...).
    # The dot is escaped and anchored on a word boundary so ordinary words
    # such as "picture", "picnic" or "epic" are left alone.
    text = re.sub(r'\bpic\.\S+', '', text)

    # transliterate non-ASCII characters to their closest ASCII form
    text = unidecode.unidecode(text)

    # keep letters, '+' and apostrophes; everything else becomes a space
    text = re.sub(r"[^a-zA-Z+']", ' ', text)

    # Split run-together words like 'whatisthis' into 'what is this' and keep
    # only pieces longer than one character. Collecting the pieces in a flat
    # list and joining once already gives single-space separation with no
    # leading or trailing blanks, so no further whitespace clean-up is needed.
    words = []
    for token in text.split():
        words.extend(piece for piece in wordninja.split(token) if len(piece) > 1)

    return ' '.join(words)

In [105]:
# Add a cleaned "text" column next to the raw "tweet" column in both splits.
train_df['text'] = train_df['tweet'].apply(clean_tweet)
test_df['text'] = test_df['tweet'].apply(clean_tweet)

In [107]:
# Training frame: cleaned text plus the label cast to an integer column.
trainData = (
    spark.createDataFrame(train_df)
    .withColumn("label", F.col("label").cast(T.IntegerType()))
    .select("text", "label")
)

# Test frame: keep the row id so predictions can be matched back for submission.
testData = spark.createDataFrame(test_df).select("id", "text")

In [108]:
# Sanity check: "label" should be reported as int after the cast above.
trainData.dtypes

[('text', 'string'), ('label', 'int')]

In [109]:
# trainData was already reduced to these two columns when it was built; the
# select only makes the training inputs explicit.
train = trainData.select("text","label")

# Fit every stage of the pipeline on the training frame.
model = pipeline.fit(train)

In [110]:
# Score the test set, keep only the row id and the predicted result, and
# bring that small frame back into pandas.
final = (
    model.transform(testData)
    .select("id", "prediction.result")
    .toPandas()
)

In [111]:
# Each "result" cell is a sequence; take its first element and store it as an int.
final["result"] = final["result"].apply(lambda preds: preds[0]).astype(int)

In [113]:
# Submission frame: row id plus the prediction, renamed to "label".
sub = final[["id", "result"]].rename(columns={"result": "label"})

# NOTE(review): index=None is falsy and appears to suppress the row index in
# the CSV; index=False is the documented spelling — confirm before changing.
sub.to_csv("submission_ver1.csv", index=None)

In [114]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,id,label
0,7921,1
1,7922,0
2,7923,1
3,7924,0
4,7925,1
