# There is a Kaggle course on NLP

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [82]:
from pyspark.sql.functions import lower, col, udf, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.types import ArrayType, StringType

from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.pipeline import Pipeline

In [3]:
df = spark.read.csv('disaster_tweet_train.csv', header=True, inferSchema=True)
df.show(4)

+---+-------+--------+--------------------+------+
| id|keyword|location|                text|target|
+---+-------+--------+--------------------+------+
|  1|   null|    null|Our Deeds are the...|     1|
|  4|   null|    null|Forest fire near ...|     1|
|  5|   null|    null|All residents ask...|     1|
|  6|   null|    null|13,000 people rec...|     1|
+---+-------+--------+--------------------+------+
only showing top 4 rows



In [4]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- keyword: string (nullable = true)
 |-- location: string (nullable = true)
 |-- text: string (nullable = true)
 |-- target: integer (nullable = true)



In [5]:
df.count(), len(df.columns)

(8387, 5)

In [6]:
df = df.dropna()
df.count(), len(df.columns)

(4771, 5)

In [7]:
df.groupby('target').count().show()

+------+-----+
|target|count|
+------+-----+
|     1| 2062|
|     0| 2709|
+------+-----+



In [8]:
df = df.select('text', 'target')
df.show(4)

+--------------------+------+
|                text|target|
+--------------------+------+
|@bbcmtd Wholesale...|     1|
|We always try to ...|     0|
|#AFRICANBAZE: Bre...|     1|
|Crying out for mo...|     0|
+--------------------+------+
only showing top 4 rows



# Preprocessing 

In [9]:
def preprocess(column):
    
    # Lowering the capital letters 
    column = lower(column)
    
    # Replacing user name 
    column = regexp_replace(column, r'@[^\s]+', 'USER')
    
    # Replacing url 
    column = regexp_replace(column, r'https?://\S+|www\.\S+', 'URL')
    
    # Replacing ayyyyy -> ayy 
    column = regexp_replace(column, r'(.)\1\1+', r'\1\1')
    
    # Replacing other than alphabets 
    column = regexp_replace(column, r'[^a-zA-Z\d\s]', '')
    column = regexp_replace(column, r'\d+', 'NUM')
    
    return column

In [10]:
df = df.withColumn('text', preprocess(col('text')))
df.show(5)

+--------------------+------+
|                text|target|
+--------------------+------+
|USER wholesale ma...|     1|
|we always try to ...|     0|
|africanbaze break...|     1|
|crying out for mo...|     0|
|on plus side look...|     0|
+--------------------+------+
only showing top 5 rows



In [56]:
train, test = df.randomSplit([0.7, 0.3])

# Tokenization, removing stop words and Stemming 

In [11]:
tokenizer = Tokenizer(inputCol='text', outputCol='text_token')
remover = StopWordsRemover(inputCol='text_token', outputCol='text')

In [12]:
df = tokenizer.transform(df).drop('text')
df = remover.transform(df).drop('text_token')
df.show(5)

+------+--------------------+
|target|                text|
+------+--------------------+
|     1|[user, wholesale,...|
|     0|[always, try, bri...|
|     1|[africanbaze, bre...|
|     0|[crying, set, abl...|
|     0|[plus, side, look...|
+------+--------------------+
only showing top 5 rows



In [13]:
print('Total number of stopwords: \t', len(StopWordsRemover().getStopWords()))
print('Example stopwords: \t\t', StopWordsRemover().getStopWords()[:5])

Total number of stopwords: 	 181
Example stopwords: 		 ['i', 'me', 'my', 'myself', 'we']


# Vectorization 

In [16]:
hashingTF = HashingTF(inputCol='text', outputCol='raw_feat', numFeatures=1000)
idf = IDF(inputCol='raw_feat', outputCol='features')

In [17]:
df = hashingTF.transform(df)
idf_model = idf.fit(df)
df = idf_model.transform(df).select('target', 'features')
df.show(4)

+------+--------------------+
|target|            features|
+------+--------------------+
|     1|(10000,[2025,3862...|
|     0|(10000,[426,3057,...|
|     1|(10000,[1527,3714...|
|     0|(10000,[5342,7657...|
+------+--------------------+
only showing top 4 rows



# Model building 

In [53]:
evaluator = MulticlassClassificationEvaluator(labelCol='target', metricName='accuracy')

In [99]:
rf = RandomForestClassifier(labelCol='target', 
                            maxDepth=16, 
                            numTrees=100)

model = rf.fit(train)
pred = model.transform(test)
evaluator.evaluate(pred)

0.6798365122615804

In [98]:
nb = NaiveBayes(labelCol='target', smoothing=200)

model = nb.fit(train)
pred = model.transform(test)
evaluator.evaluate(pred)

0.771117166212534

In [97]:
svc = LinearSVC(labelCol='target', regParam=1, maxIter=20)

model = svc.fit(train)
pred = model.transform(test)
evaluator.evaluate(pred)

0.7411444141689373

# Future direction 

- Pipeline 
- Emoji and emoticons 
- Lementation 