In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
train_data = pd.read_csv('train_2kmZucJ.csv')

In [3]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 3 columns):
id       7920 non-null int64
label    7920 non-null int64
tweet    7920 non-null object
dtypes: int64(2), object(1)
memory usage: 185.7+ KB


In [5]:
train_data['label'].value_counts()
#this will give the number of O's and 1's present

0    5894
1    2026
Name: label, dtype: int64

In [6]:
train_data.head()['tweet']

0    #fingerprint #Pregnancy Test https://goo.gl/h1...
1    Finally a transparant silicon case ^^ Thanks t...
2    We love this! Would you go? #talk #makememorie...
3    I'm wired I know I'm George I was made that wa...
4    What amazing service! Apple won't even talk to...
Name: tweet, dtype: object

In [7]:
## importing regular expression library ##
import re
def process_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", "",tweet.lower()).split())

In [8]:
train_data['tweet'].head(5).apply(process_tweet)

0    fingerprint pregnancy test httpsgooglh1mfqv an...
1    finally a transparant silicon case thanks to m...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

In [32]:
x_train, x_test, y_train, y_test = train_test_split(train_data["tweet"],train_data["label"],test_size=0.2,random_state=3)

In [33]:
x_train.shape

(6336,)

In [34]:
x_test.shape

(1584,)

In [28]:
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [29]:
## for transforming the 80% of the train data ##
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [31]:
## for transforming the 20% of the train data which is being used for testing ##
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [15]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train_tfidf,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
predictions = model.predict(x_test_tfidf)

In [17]:
print(predictions)

[0 0 0 ... 0 0 0]


In [18]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,predictions))

0.8869949494949495


In [20]:
#importing test data
test_data = pd.read_csv("test_oJQbWVk.csv")

In [35]:
## for transforming the whole train data ##
train_counts = count_vect.fit_transform(train_data['tweet'])
train_tfidf = transformer.fit_transform(train_counts)

## for transforming the test data ##
test_counts = count_vect.transform(test_data['tweet'])
test_tfidf = transformer.transform(test_counts)

## fitting the model on the transformed train data ##
model.fit(train_tfidf,train_data['label'])

## predicting the results ##
predictions = model.predict(test_tfidf)


In [36]:
final_result = pd.DataFrame({'id':test_data['id'],'label':predictions})
final_result.to_csv('output.csv',index=False)