In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
stopwords = nltk.corpus.stopwords.words('english')
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import string
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

In [2]:
# read the file that was tagged by all of us
df = pd.read_csv("./datasets/text_data.csv", encoding = "ISO-8859-1")

In [3]:
df.columns

Index(['ID', 'File', 'Page', 'Text', 'Context', 'Unnamed: 5', 'Unnamed: 6'], dtype='object')

In [4]:
# see how many context 
df['Context'].unique()

array(['Cover page', 'Carbon Neutrality', 'Climate change adaptation',
       'Undefined', 'Acknowledgments', 'Contents', 'Bibliography',
       'Glossary', 'Appendix', 'Flooding', 'General', 'Storm',
       'ice storm', 'Wildfire', 'Mitigation', 'Heat wave',
       'Climate model', 'glossary', 'bibliography', 'Wind Storm',
       'drought', 'cover page', nan], dtype=object)

In [28]:
# extract rows only from appropriate context
# only include climate related context
df2=df[df['Context'].isin(['Carbon Neutrality', 'Climate change adaptation','Flooding', 'General', 'Storm',
       'ice storm', 'Wildfire', 'Mitigation',  'Heat wave',
       'Wind Storm', 'drought'])]

In [6]:
# check how many records per climate related context
df2.groupby(['Context']).size()

Context
Carbon Neutrality             10
Climate change adaptation    708
Flooding                     282
General                       51
Heat wave                      5
Mitigation                    20
Storm                          9
Wildfire                      21
Wind Storm                    13
drought                        1
ice storm                     14
dtype: int64

In [29]:
# Not enough data to classify anything other than 'Climate change adaptation' and "Flooding"
# lets train a classify for each context
# Merge Storm, ice storm and Wind Storm as one context
# also merge drought and Heat Wave as one context
df2.loc[df2['Context'].isin(['Wind Storm','ice storm']),'Context']='Storm'
df2.loc[df2['Context']=='drought','Context']='Heat wave'
df2.groupby(['Context']).size()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Context
Carbon Neutrality             10
Climate change adaptation    708
Flooding                     282
General                       51
Heat wave                      6
Mitigation                    20
Storm                         36
Wildfire                      21
dtype: int64

In [30]:
df2.reset_index(drop=True,inplace=True)


In [9]:
# create function to use bag of words approach 
# use count vectorizer
class LemmaCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaCountVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))

In [10]:
df2.index

RangeIndex(start=0, stop=1134, step=1)

In [31]:
# create training and test data set use stratified sampling
x_train, x_test, y_train, y_test = train_test_split(df2['Text'], df2['Context'], test_size=0.3)

In [32]:
x_train.reset_index(drop=True,inplace=True)
x_test.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
y_test.reset_index(drop=True,inplace=True)

In [37]:
# make sure y_train contains all the Context
(y_train.to_frame()).groupby(['Context']).size()

Context
Carbon Neutrality              7
Climate change adaptation    495
Flooding                     196
General                       35
Heat wave                      6
Mitigation                    17
Storm                         28
Wildfire                       9
dtype: int64

In [38]:
# make sure y_test contains all the Context
(y_test.to_frame()).groupby(['Context']).size()

Context
Carbon Neutrality              3
Climate change adaptation    213
Flooding                      86
General                       16
Mitigation                     3
Storm                          8
Wildfire                      12
dtype: int64

In [33]:
# create model using multinomial classifier
from sklearn.pipeline import Pipeline
pipeline_nb = Pipeline([
    ('bow', LemmaCountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [34]:
pipeline_nb.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('bow',
                 LemmaCountVectorizer(analyzer='word', binary=False,
                                      decode_error='strict',
                                      dtype=<class 'numpy.int64'>,
                                      encoding='utf-8', input='content',
                                      lowercase=True, max_df=1.0,
                                      max_features=None, min_df=1,
                                      ngram_range=(1, 1), preprocessor=None,
                                      stop_words='english', strip_accents=None,
                                      token_pattern='(?u)\\b\\w\\w+\\b',
                                      tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None,

In [35]:
pred_nb=pipeline_nb.predict(x_test)

In [16]:
# the f1-score is not that good
from sklearn.metrics import classification_report
print (classification_report(pred_nb, y_test))

                           precision    recall  f1-score   support

        Carbon Neutrality       0.00      0.00      0.00         0
Climate change adaptation       1.00      0.71      0.83       308
                 Flooding       0.37      0.88      0.52        33
                  General       0.00      0.00      0.00         0
                Heat wave       0.00      0.00      0.00         0
               Mitigation       0.00      0.00      0.00         0
                    Storm       0.00      0.00      0.00         0
                 Wildfire       0.00      0.00      0.00         0

                 accuracy                           0.72       341
                macro avg       0.17      0.20      0.17       341
             weighted avg       0.94      0.72      0.80       341



  'recall', 'true', average, warn_for)


In [39]:
# use SGD Classifier
pipeline_sga = Pipeline([
    ('bow', LemmaCountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', SGDClassifier())
])

In [40]:
# f1-score actually slightly better than Multinomial
pipeline_sga.fit(x_train,y_train)
pred_sga=pipeline_sga.predict(x_test)
print (classification_report(pred_sga, y_test))

                           precision    recall  f1-score   support

        Carbon Neutrality       1.00      1.00      1.00         3
Climate change adaptation       0.96      0.88      0.92       232
                 Flooding       0.84      0.87      0.85        83
                  General       0.25      0.67      0.36         6
               Mitigation       0.00      0.00      0.00         1
                    Storm       0.88      0.78      0.82         9
                 Wildfire       0.58      1.00      0.74         7

                 accuracy                           0.87       341
                macro avg       0.64      0.74      0.67       341
             weighted avg       0.91      0.87      0.89       341



In [41]:
# Use random forest
# f1-score is actually worse than SGD Classifier
pipeline_rf = Pipeline([
    ('bow', LemmaCountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(n_estimators = 100))
])

In [22]:
pipeline_rf.fit(x_train,y_train)
pred_rf=pipeline_rf.predict(x_test)
print (classification_report(pred_rf, y_test))

                           precision    recall  f1-score   support

        Carbon Neutrality       0.00      0.00      0.00         0
Climate change adaptation       0.98      0.81      0.89       263
                 Flooding       0.76      0.81      0.78        74
                  General       0.00      0.00      0.00         1
                Heat wave       0.00      0.00      0.00         0
               Mitigation       0.00      0.00      0.00         0
                    Storm       0.30      1.00      0.46         3
                 Wildfire       0.00      0.00      0.00         0

                 accuracy                           0.81       341
                macro avg       0.26      0.33      0.27       341
             weighted avg       0.92      0.81      0.86       341



  'recall', 'true', average, warn_for)


In [26]:
# use logistic regression
pipeline_lr = Pipeline([
    ('bow', LemmaCountVectorizer(analyzer='word', stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression())
])

In [27]:
pipeline_lr.fit(x_train,y_train)
pred_lr=pipeline_rf.predict(x_test)
print (classification_report(pred_lr, y_test))



                           precision    recall  f1-score   support

        Carbon Neutrality       0.00      0.00      0.00         0
Climate change adaptation       0.98      0.81      0.89       263
                 Flooding       0.76      0.81      0.78        74
                  General       0.00      0.00      0.00         1
                Heat wave       0.00      0.00      0.00         0
               Mitigation       0.00      0.00      0.00         0
                    Storm       0.30      1.00      0.46         3
                 Wildfire       0.00      0.00      0.00         0

                 accuracy                           0.81       341
                macro avg       0.26      0.33      0.27       341
             weighted avg       0.92      0.81      0.86       341

