In [130]:
import pandas as pd
import nltk
import os
import string
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [48]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/tiger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tiger/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [65]:
def load_file(file_name):
    data = pd.read_csv(file_name)
    print(data.head())
    return data

In [66]:
train_file = os.path.join('datasets','train.csv')
test_file = os.path.join('datasets','test.csv')

In [67]:
train_data = load_file(train_file)
test_data = load_file(test_file)

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [88]:
class CommonDataProcess:
    def __init__(self):
        self.stop_words = [w.lower() for w in stopwords.words('english')]
        self.lemmatizer = WordNetLemmatizer()
    def remove_digits(self, text):
        text = re.sub('[0-9]',"",text)
        return text
    def lower_case(self, text):
        return text.lower()
    def word_tokenize_func(self, text):
        text = word_tokenize(text)
        return text
    def lemmatize_func(self, words):
        save = []
        for w in words:
            save.append(lemmatizer.lemmatize(w))
        return save
    def remove_stop_words(self, words):
        save = []
        for i in words:
            if i not in self.stop_words:
                save.append(i)
        return save
    def punctuation_remove(self, text):
        text = re.sub("[!|\"|#|$|%|&|\'|\(|\)|*|+|,|-|.|/|:|;|<|=|>|?|@|\[|\\|\]|^|_|\`|\{|\||\}|~]", " ", text)
        return text
    def process_text(self, text):
        input_text = text
        text = self.lower_case(text)
        text = self.remove_digits(text)
        text = self.punctuation_remove(text)
        words = self.word_tokenize_func(text)
        words = self.lemmatize_func(words)
        words = self.remove_stop_words(words)
        # print(words, input_text)
        return words
        
comm = CommonDataProcess()
# print(comm.process_text('i, study. in america? 911'))

In [102]:
def make_corpus():
    corpus = []
    all_text = train_data['text'].tolist() + test_data['text'].tolist()
    for i in range(len(all_text)):
        clean = comm.process_text(all_text[i])
        clean = " ".join([w for w in clean])
        clean = clean.strip()
        corpus.append(clean)
    return corpus

In [103]:
def apply_tfidf(corpus):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    print(len(vectorizer.get_feature_names()))
    print(X.shape)
    return vectorizer, X

In [104]:
corpus= make_corpus()
print(len(corpus))
print(len(train_data.index), len(test_data.index))

10876
7613 3263


In [121]:
vectorizer, X = apply_tfidf(corpus=corpus)
X_arr = X.toarray()

25393
(10876, 25393)


In [128]:
idx_map = {}
tf_idf_weight={}
for w in vectorizer.vocabulary_:
    idx_map[vectorizer.vocabulary_[w]] = w
# print(idx_map)
for row in X:
    indices = row.indices
    weight = row.data
    print(indices)
    print(weight)
    for i in range(0, len(indices)):
        if tf_idf_weight.get(indices[i]) is None:
            tf_idf_weight[indices[i]] = weight[i]
        else:
            print(indices[i], tf_idf_weight[indices[i]], weight[i])
            assert(tf_idf_weight[indices[i]] == weight[i])

[ 7543   584 13313  5968 17813  4906]
[0.47724964 0.42914156 0.30689576 0.3338231  0.35650945 0.50532014]
[ 2966 18939 18527 12039 14546  7286  7534]
[0.38575143 0.5002272  0.5002272  0.35261631 0.30611794 0.21457655
 0.29436673]
[ 6796 15667  6667 15337 14973 16508 19438  1140 18128]
[0.2571909  0.22129414 0.21106286 0.2286107  0.33596837 0.45886817
 0.56667504 0.26787685 0.26787685]
[ 2914 23898 17839 16217 15667  6667]
[0.34118418 0.3614614  0.58022411 0.30373414 0.41191709 0.39287259]
15667 0.22129413712583343 0.4119170913801556


AssertionError: 

In [132]:
print(type(X_arr))
X_train = X_arr[0:len(train_data.index)]
X_test = X_arr[len(train_data.index):]
Y_train = train_data['target']
#text = X_arr[0:len(tes)]

<class 'numpy.ndarray'>
0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64


In [133]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)

In [136]:
Y_test = clf.predict(X_test)
print(Y_test)

[1 1 1 ... 1 1 0]


In [137]:
dic={0:0,1:0}
for i in range(0, len(Y_test)):
    dic[Y_test[i]] =  dic[Y_test[i]]+1
print(dic)

{0: 2238, 1: 1025}


In [139]:
res_list = test_data['id']
pd_data = {'id':res_list, 'target':Y_test}
df = pd.DataFrame(pd_data)  