<a href="https://colab.research.google.com/github/shjain6670/Deep-Learning-/blob/master/Real_or_Not_Real%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading Required Libraries and Dataset

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

In [0]:
# Load the labelled training tweets and the unlabelled test tweets from the working directory.
train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

### A quick look at our data

In [4]:
# Second tweet (position 1) among the rows whose target is 0.
train_df.loc[train_df["target"] == 0, "text"].iloc[1]

'I love fruits'

In [5]:
# Second tweet (position 1) among the rows whose target is 1.
train_df.loc[train_df["target"] == 1, "text"].iloc[1]

'Forest fire near La Ronge Sask. Canada'

In [6]:
# Preview the first five rows of the training frame.
train_df.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Per-column summary (dtype and non-null count) to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


# Building vectors

In [0]:
# Vectorizer reserved for the real features; it is fitted later on the full training set.
count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
# The demo uses its own throwaway vectorizer so that re-running this cell never
# changes the vocabulary of `count_vectorizer`, which the model cells depend on.
example_vectorizer = feature_extraction.text.CountVectorizer()
example_train_vectors = example_vectorizer.fit_transform(train_df["text"][0:5])

In [11]:
## the vectors are stored "sparse" (only non-zero elements are kept to save space),
## so convert the first tweet's row to a dense matrix once before printing it
first_tweet_dense = example_train_vectors[0].todense()
print(first_tweet_dense.shape)
print(first_tweet_dense)

(1, 54)
[[0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0
  0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0]]


In [0]:
# Learn the vocabulary from every training tweet and encode those tweets in one step.
train_vectors = count_vectorizer.fit_transform(train_df["text"])

## Deliberately .transform() only (NOT .fit_transform()) for the test tweets:
# they are encoded with the vocabulary learned from the training tweets, so the
# train and test matrices share exactly the same set of token columns.
test_vectors = count_vectorizer.transform(test_df["text"])

# Building Ridge Classifier Model

The presence of a particular word (or set of words) in a tweet might link directly to whether or not that tweet is real.

What we're assuming here is a linear connection.

In [0]:
## Our vectors are really big (one column per token), so we want to push our model's
## weights toward 0 without completely discounting different words - ridge
## regularisation is a good way to do this.
## The classifier is created with scikit-learn's default settings.
clf = linear_model.RidgeClassifier()

### Cross-Validation

In [14]:
# 3-fold cross-validated F1 score of the default ridge classifier on the training data.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    scoring="f1",
    cv=3,
)
scores

array([0.59453669, 0.56498283, 0.64082434])

In [15]:
# Fit the classifier on the complete training set (cross-validation above only scored it).
clf.fit(X=train_vectors, y=train_df["target"])

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, normalize=False, random_state=None,
                solver='auto', tol=0.001)

In [0]:
# Template submission file; its "target" column is overwritten with model predictions below.
sample_submission = pd.read_csv(filepath_or_buffer="sample_submission.csv")

In [0]:
# NOTE(review): assumes sample_submission.csv lists its rows in the same order as
# test.csv, since predictions are assigned by position — confirm.
ridge_predictions = clf.predict(test_vectors)
sample_submission["target"] = ridge_predictions

In [18]:
# Sanity-check the first five predicted rows before writing the file.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [0]:
# Write the predictions without the DataFrame index column.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)

## Second Ridge Classifier Model

In [0]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths: 50 values, log-spaced from 1e-4 up to 1.
# NOTE(review): the recorded run selected alpha = 1.0, the upper edge of this range,
# so larger alphas may be worth adding to the search.
alpha_space = np.logspace(start=-4, stop=0, num=50)
# Hyperparameter grid for the search: a single parameter, alpha.
param_grid = dict(alpha=alpha_space)

In [0]:
# Fresh, unfitted RidgeClassifier that the grid search below uses as its base estimator.
clf2 = linear_model.RidgeClassifier()

In [0]:
# Grid search over the alpha grid for clf2: 5-fold cross-validation, scored by F1.
clf_2_cv = GridSearchCV(
    estimator=clf2,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
)

In [28]:
# Run the search: every alpha in the grid is cross-validated on the training data.
clf_2_cv.fit(X=train_vectors, y=train_df["target"])

GridSearchCV(cv=5, error_score=nan,
             estimator=RidgeClassifier(alpha=1.0, class_weight=None,
                                       copy_X=True, fit_intercept=True,
                                       max_iter=None, normalize=False,
                                       random_state=None, solver='auto',
                                       tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': array([1.00000000e-04, 1.20679264e-04, 1.45634848e-04, 1.75751062e-04,
       2.12095089e-04, 2.55954792e-04, 3.08884360e-...
       4.09491506e-02, 4.94171336e-02, 5.96362332e-02, 7.19685673e-02,
       8.68511374e-02, 1.04811313e-01, 1.26485522e-01, 1.52641797e-01,
       1.84206997e-01, 2.22299648e-01, 2.68269580e-01, 3.23745754e-01,
       3.90693994e-01, 4.71486636e-01, 5.68986603e-01, 6.86648845e-01,
       8.28642773e-01, 1.00000000e+00])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='

In [30]:
# Report the winning alpha and its mean cross-validated F1 score.
best_params = clf_2_cv.best_params_
print("Tuned Ridge classifier Parameters: {}".format(best_params))
best_score = clf_2_cv.best_score_
print("Best score is {}".format(best_score))

Tuned Ridge classifier Parameters: {'alpha': 1.0}
Best score is 0.5708509197504545


In [0]:
# Predict with the grid-searched model (the search was created with refit=True).
# NOTE(review): assumes sample_submission.csv lists its rows in the same order as
# test.csv, since predictions are assigned by position — confirm.
tuned_ridge_predictions = clf_2_cv.predict(test_vectors)
sample_submission["target"] = tuned_ridge_predictions

In [32]:
# Sanity-check the first five rows of the tuned model's predictions.
sample_submission.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


In [0]:
# Write the tuned model's predictions without the index column.
# Note: this reuses the path "submission.csv", so it overwrites the file saved for the first model.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)