In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Read the train and test CSV files into DataFrames.
train_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/train.csv')
test_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/test.csv')

# Report the (rows, columns) shape of each frame, one per line.
print(f'train shape => {train_data.shape}',
      f'test shape => {test_data.shape}',
      sep='\n')

# Example data

In [None]:
# Fixed seed so the same five rows are displayed on every run
# (1 is the seed this notebook also uses for its train/validation split).
train_data.sample(5, random_state=1)

In [None]:
# Fixed seed so the same five rows are displayed on every run
# (1 is the seed this notebook also uses for its train/validation split).
test_data.sample(5, random_state=1)

# Describe

In [None]:
# Summary statistics (count / unique / top / freq) for object-dtype columns only.
train_data.describe(include=[object])

In [None]:
# Summary statistics (count / unique / top / freq) for object-dtype columns only.
test_data.describe(include=[object])

# Missing data

In [None]:
def missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """Report the share of missing values per column, as a percentage.

    Args:
        data: Frame to inspect.

    Returns:
        A one-column frame (``Missing_percent``) indexed by the column names
        of ``data``, restricted to columns with at least one missing value.
    """
    n_rows = data.shape[0]
    na_percent = data.isna().sum() / n_rows * 100
    report = pd.DataFrame(na_percent, columns=['Missing_percent'])
    # Keep only the columns that actually have gaps.
    return report[report['Missing_percent'] > 0]

In [None]:
# Percentage of missing values per column in the training frame.
missing_values(data=train_data)

In [None]:
# Percentage of missing values per column in the test frame.
missing_values(data=test_data)

# Distribution of the target variable

In [None]:
# Number of training samples per target class.
# The series is passed as `x=` explicitly: from seaborn 0.12 on, the first
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the x variable.
sns.countplot(x=train_data['target']);
plt.ylabel('Samples');

# Example tweets

In [None]:
# First tweet text among the rows labelled target == 0.
train_data.loc[train_data['target'] == 0, 'text'].values[0]

In [None]:
# First tweet text among the rows labelled target == 1.
train_data.loc[train_data['target'] == 1, 'text'].values[0]

# Distribution of locations

In [None]:
# Distinct values of the `location` column (includes NaN if any are missing).
train_data['location'].unique()

In [None]:
# Number of distinct non-missing locations.
train_data['location'].nunique()

In [None]:
# Ten most frequent locations. value_counts() is computed once and reused for
# both axes instead of being recomputed per argument.
top_locations = train_data['location'].value_counts().head(10)

sns.barplot(x=top_locations.values, y=top_locations.index);
# Explicit labels so the figure reads the same regardless of how pandas names
# the value_counts() result.
plt.xlabel('Samples');
plt.ylabel('Location');

In [None]:
# Raw tweet texts as a NumPy array, before any preprocessing.
train_data['text'].values

# Preprocessing

In [None]:
import re
import unidecode

In [None]:
# Matches http(s):// links and bare www. links up to the next whitespace.
# Raw string: \S and \. are regex escapes, not valid Python string escapes,
# and a non-raw literal triggers an invalid-escape warning.
url = re.compile(r'https?://\S+|www\.\S+')

In [None]:
# Sanity check of the URL pattern on one tweet-like string: the trailing link is stripped.
url.sub('', 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ')

In [None]:
import string

In [None]:
# Patterns are compiled once, from raw strings so that regex escapes such as
# \S and \w are not parsed as (invalid) Python string escapes.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')


def preprocessing(text):
    """Normalise a raw tweet into lowercase ASCII text without markup or noise.

    Steps, in order: transliterate to ASCII and lowercase, drop ``[...]``
    spans, URLs and ``<...>`` tags, strip ASCII punctuation, turn newlines
    into spaces, and remove every word that contains a digit.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned string. Whitespace is not collapsed, so runs of spaces
        may remain where pieces were removed.
    """
    # Transliterate before lowercasing: unidecode can emit capital letters
    # (e.g. for CJK characters), which lowercasing first would leave in place.
    text = unidecode.unidecode(text).lower()

    text = _BRACKETED_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)

    # Replace (rather than delete) line breaks so that words on adjacent
    # lines are not glued together into a single token.
    text = text.replace('\n', ' ')
    text = _DIGIT_WORD_RE.sub('', text)

    return text

In [None]:
# Re-read both CSV files so the cells below start from freshly loaded frames.
train_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/train.csv')
test_data = pd.read_csv(filepath_or_buffer='../input/nlp-getting-started/test.csv')

In [None]:
# Clean every tweet in place; the `text` column is overwritten in both frames.
train_data['text'] = train_data['text'].apply(preprocessing)
test_data['text'] = test_data['text'].apply(preprocessing)

In [None]:
# Inspect the cleaned tweets.
train_data['text']

# Tokenization

In [None]:
import nltk

In [None]:
# Tokenizer that keeps each maximal run of word characters as one token.
tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+')

In [None]:
# Replace each tweet string with its list of tokens (in place, both frames).
train_data['text'] = train_data['text'].apply(tokenizer.tokenize)
test_data['text'] = test_data['text'].apply(tokenizer.tokenize)

In [None]:
# Preview the tokenized frame; only the first rows are shown instead of dumping the whole table.
train_data.head()

# Remove stopwords

In [None]:
from nltk.corpus import stopwords

In [None]:
# Build the stop-word set once. The previous form evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list linearly; a set gives constant-time membership checks.
english_stopwords = frozenset(stopwords.words('english'))

# Drop stop words from each token list (in place, both frames).
train_data.text = train_data.text.apply(lambda text: [w for w in text if w not in english_stopwords])
test_data.text = test_data.text.apply(lambda text: [w for w in text if w not in english_stopwords])

In [None]:
# Glue each token list back into one space-separated string (in place, both frames).
train_data['text'] = train_data['text'].apply(' '.join)
test_data['text'] = test_data['text'].apply(' '.join)

In [None]:
# Inspect the tweets after stop-word removal and re-joining.
train_data['text']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets, stratified on the target so both parts
# keep the same class proportions; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    train_data['text'].values,
    train_data['target'].values,
    test_size=0.2,
    random_state=1,
    stratify=train_data['target'].values,
)

In [None]:
# Unigram + bigram TF-IDF; terms must occur in at least 2 documents and in at
# most half of them.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)

# The vocabulary is fitted on the training split only, then reused as-is.
train_tfidf = tfidf.fit_transform(X_train)
# Held-out 20% of the labelled data.
test_tfidf = tfidf.transform(X_test)
# Unlabelled test.csv tweets, used for the submission.
test = tfidf.transform(test_data['text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [None]:
# NOTE(review): disabled grid-search helper, kept for reference only. Before
# re-enabling it: it references GridSearchCV, which no visible cell imports,
# and `clf`, whose assignment (the Pipeline lines) is itself commented out a
# second time.
# def grid_params_model(model, params, X_train, y_train, X_dev, y_dev):
# #     clf = Pipeline(steps=[('preprocessor', preprocessor),
# #                       ('model', model)
# #                      ])
#     print(clf.get_params().keys())
#     grid_search = GridSearchCV(clf, params, cv=5, scoring='f1')
#     grid_search.fit(X_train, y_train)

#     model_gs_best = grid_search.best_estimator_ 

#     metrics_name = ["Mean cross-validated score", "f1_score train", "f1_score dev"]
#     metrics_values = [grid_search.best_score_,
#                     f1_score(y_train, model_gs_best.predict(X_train)),
#                     f1_score(y_dev, model_gs_best.predict(X_dev))
#                      ]

#     metrics_res = {k : v for k, v in zip(metrics_name, metrics_values)}

#     return model_gs_best, metrics_res

In [None]:
# Logistic regression scored with 5-fold cross-validated F1 on the training split.
clf = LogisticRegression(C=1.0)
scores = cross_val_score(estimator=clf, X=train_tfidf, y=y_train, scoring="f1", cv=5)
scores

In [None]:
# Fit the classifier on the full training split.
clf.fit(X=train_tfidf, y=y_train)

In [None]:
# F1 on the held-out 20% of the labelled data.
f1_score(y_true=y_test, y_pred=clf.predict(X=test_tfidf))

In [None]:
# Predicted labels for the unlabelled test.csv tweets.
clf.predict(X=test)

In [None]:
# Ids of the test tweets. They are taken from the frame already in memory:
# only its `text` column was modified above, so the ids match test.csv and
# there is no need to read the file from disk again.
test_data['id'].values

In [None]:
# Pair each test id with its predicted label. The ids come from `test_data`,
# the same frame the `test` feature matrix was built from, so the rows line up
# by construction and test.csv does not have to be read again.
submission = pd.DataFrame({'id': test_data['id'].values,
                           'target': clf.predict(test)})

In [None]:
# Write the predictions to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)