In [4]:
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, feature_extraction, linear_model
import pickle

In [5]:
## read the training and test tweets from CSV files in the working directory
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [6]:
## preview the first rows of the test set (columns: id, keyword, location, text)
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
## show the second tweet whose target label is 0
is_label_zero = train_data['target'] == 0
train_data.loc[is_label_zero, 'text'].iloc[1]

'I love fruits'

In [8]:
## show the second tweet whose target label is 1
is_label_one = train_data['target'] == 1
train_data.loc[is_label_one, 'text'].iloc[1]

'Forest fire near La Ronge Sask. Canada'

In [9]:
## vectorizer that turns each tweet into a vector of token counts
## (it is fitted on a small sample here and fitted again on the full training text in a later cell)
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for a small sample: positions 5-9 of the data, i.e. the 6th to 10th tweets
## (the slice end is exclusive, so [5:10] yields five rows)
example_train_vectors = count_vectorizer.fit_transform(train_data['text'][5:10])

In [10]:
## the vectors are stored "sparse" (only non-zero elements are kept to save space),
## so .todense() is used to expand the first one into a full row of counts
first_vector = example_train_vectors[0]
first_dense = first_vector.todense()

print(first_dense.shape)
print(first_dense)
## printing the sparse vector itself lists only the (row, column) positions that are non-zero
print(first_vector)

(1, 53)
[[1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0
  0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0]]
  (0, 39)	1
  (0, 50)	1
  (0, 10)	1
  (0, 29)	1
  (0, 0)	1
  (0, 13)	1
  (0, 30)	1
  (0, 7)	1
  (0, 17)	1
  (0, 19)	1
  (0, 47)	1
  (0, 32)	1
  (0, 16)	1
  (0, 22)	1
  (0, 9)	1
  (0, 51)	1


In [11]:
## learn the vocabulary from the full training text and turn each training tweet into a count vector
train_vectors = count_vectorizer.fit_transform(train_data['text'])

## fit_transform is deliberately NOT used here: .transform() reuses the vocabulary learned from the
## training text, so test and train tweets are mapped onto the same vector columns
test_vectors = count_vectorizer.transform(test_data['text'])

In [12]:
## Modelling assumption: the presence of particular words in a tweet is taken to be directly
## related to whether the tweet's target is 1 or 0, so a linear model over the word counts is used.
## The number of observations is assumed to be smaller than the number of words/tokens
## (not checked in this notebook), so a ridge (L2-regularised) classifier is chosen to keep
## the per-word weights well-behaved.
rclf = linear_model.RidgeClassifier()

In [13]:
## 3-fold cross-validated F1 score of the ridge classifier on the training vectors
scores = model_selection.cross_val_score(
    estimator=rclf,
    X=train_vectors,
    y=train_data['target'],
    scoring='f1',
    cv=3,
)
scores

array([0.59421842, 0.56498283, 0.64051005])

In [14]:
## fit the classifier on the full training set (the cross-validation above does not fit rclf itself)
rclf.fit(X=train_vectors, y=train_data['target'])

RidgeClassifier()

In [15]:
## load the submission template; its 'target' column is overwritten with model predictions in a later cell
sample_submission = pd.read_csv('sample_submission.csv')

In [16]:
## predictions are assigned by position, so the submission rows must line up with the
## test rows they were computed from; fail loudly instead of writing misaligned labels
assert np.array_equal(
    sample_submission['id'].to_numpy(), test_data['id'].to_numpy()
), "sample_submission ids do not match test_data ids"

## one predicted label per test tweet, in test_data row order
sample_submission['target'] = rclf.predict(test_vectors)

In [17]:
## quick look at the first predicted targets next to their ids
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [18]:
## write the predictions to submission.csv; index=False keeps the DataFrame index out of the file
sample_submission.to_csv('submission.csv', index=False)

In [23]:
## open the file where the fitted classifier is stored; the `with` block closes the
## file even if pickle.dump raises, so the handle is never leaked
## NOTE: only the classifier is saved here. The fitted count_vectorizer is not, and it is
## needed to turn new text into the vectors this model was trained on.
with open('disaster_prediction_model.pkl', 'wb') as file:
    ## dump the model into that file
    pickle.dump(rclf, file)