In [0]:
%%bash

# Abort on the first failing command (or unset variable / failed pipe stage)
# so that a broken install or download is reported instead of being masked
# by the exit status of a later command.
set -euo pipefail

# Install required packages.
# NOTE: the canonical PyPI name of the `imblearn` module is `imbalanced-learn`.
pip install -U -q git+https://github.com/facebookresearch/fastText.git
pip install -U -q pandas scikit-learn imbalanced-learn

# Download the dataset archive and extract only train.csv from it.
wget -q -O data.zip https://he-s3.s3.amazonaws.com/media/hackathon/predict-the-happiness/predict-the-happiness/f2c2f440-8-dataset_he.zip
unzip -oqq data.zip train.csv

In [0]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# The fastText Python bindings were renamed from `fastText` to `fasttext`.
# The install cell pulls the latest sources, so try the current module name
# first and fall back to the legacy one to stay runnable with older builds.
try:
  from fasttext import train_supervised, load_model
except ImportError:
  from fastText import train_supervised, load_model

### Загрузим данные, выровняем соотношение классов для упрощения задачи

In [3]:
# Keep only the review text and its label, under short column names.
df = pd.read_csv('train.csv', usecols=['Description', 'Is_Response'])\
  .rename(columns={'Description': 'x', 'Is_Response': 'y'})
# Replace spaces in the labels so that every label is a single
# whitespace-free token when written to the fastText training file.
df['y'] = df['y'].map(lambda x: x.replace(' ', '_'))

# Balance the classes by randomly dropping rows of the majority class.
# Plain NumPy arrays are passed on purpose: recent imbalanced-learn releases
# return a DataFrame when given a DataFrame, which breaks the positional
# `X[:, 0]` indexing below; array input yields array output in every version.
X, y = RandomUnderSampler(random_state=42).fit_resample(df[['x']].values, df['y'].values)
df = pd.DataFrame({'x': X[:, 0], 'y': y})

print(df.y.value_counts())
df.head()

happy        12411
not_happy    12411
Name: y, dtype: int64


Unnamed: 0,x,y
0,Henry is a big part of why I look forward to c...,happy
1,We stayed - nights at the blue sea lodge at th...,happy
2,I've stayed at several Embassy Suites in the p...,happy
3,I was recently in Chicago for business and sta...,happy
4,My wife and I recently traveled to NYC for fou...,happy


### Разобьем данные на обучающую и тестовую выборки

In [0]:
# Hold out 20% of the data for testing, keeping the class ratio (stratify).
# A fixed random_state makes the split -- and therefore the metrics reported
# in later cells -- reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['y'], random_state=42)

# Take independent copies: later cells assign columns on both frames.
train_df = train_df.copy()
test_df = test_df.copy()

### Выполним минимальную необходимую предобработку данных

In [0]:
def preprocess(x):
  """Turn a raw text into a single trimmed line.

  Every newline character is replaced by a space, then leading and trailing
  whitespace is removed.
  """
  # TODO: read the fastText authors' preprocessing recommendations and apply them
  one_line = ' '.join(x.split('\n'))
  return one_line.strip()


# Clean the text column of both splits with the same routine.
for split_df in (train_df, test_df):
  split_df['x'] = split_df['x'].map(preprocess)

### Обучим и провалидируем модель

In [0]:
def save_df_in_fasttext_format(df, path, label_prefix='__label__'):
  """Write a labelled DataFrame to `path` in fastText's supervised format.

  One line is written per row: ``<label_prefix><y> <x>``, with the text
  stripped of leading/trailing whitespace.

  Args:
    df: DataFrame with the texts in column 'x' and the labels in column 'y'.
    path: destination file; it is created or overwritten.
    label_prefix: string prepended to every label.

  Raises:
    AssertionError: if a text contains a newline, which would split one
      example over several lines of the output file.
  """
  # Encode explicitly as UTF-8 (the encoding fastText expects) instead of
  # relying on the platform's locale-dependent default.
  with open(path, 'w', encoding='utf-8') as f:
    # Iterate over the two columns directly; iterrows() would build a
    # Series object for every single row.
    for text, label in zip(df['x'], df['y']):
      assert '\n' not in text, 'each text must be a single line'
      f.write('{}{} {}\n'.format(label_prefix, label, text.strip()))
      

def predict(model, df, label_prefix='__label__'):
  """Return the top predicted label for every text in `df['x']`.

  Args:
    model: object whose `predict(list_of_texts)` returns a pair
      `(labels, probabilities)` holding one sequence of labels per text.
    df: DataFrame with the texts in column 'x'.
    label_prefix: prefix removed from the start of each predicted label.

  Returns:
    A list with one label string per row of `df`, without the prefix.
  """
  # Only the labels are needed; the probabilities are discarded.
  labels, _ = model.predict(df['x'].tolist())

  def strip_prefix(label):
    # Drop the prefix only where it leads the label; str.replace would also
    # delete occurrences of the prefix inside the label itself.
    return label[len(label_prefix):] if label.startswith(label_prefix) else label

  return [strip_prefix(label_list[0]) for label_list in labels]


# Dump the training split to disk in fastText's input format.
save_df_in_fasttext_format(df=train_df, path='train.data')

In [13]:
# TODO: get familiar with the model parameters
train_params = dict(
  input='train.data',
  epoch=25,
  lr=1.0,
  wordNgrams=2,
  verbose=2,
  minCount=1,
  label='__label__',
)
model = train_supervised(**train_params)
model.save_model('ft.model')

# Score the held-out split with the freshly trained model.
test_df['y_pred'] = predict(model, test_df)
report = classification_report(test_df['y'], test_df['y_pred'])
print(report)

              precision    recall  f1-score   support

       happy       0.87      0.87      0.87      2482
   not_happy       0.87      0.87      0.87      2483

   micro avg       0.87      0.87      0.87      4965
   macro avg       0.87      0.87      0.87      4965
weighted avg       0.87      0.87      0.87      4965



### Выполним сжатие модели и повторно замерим качество

In [14]:
# Quantize the trained model (the same `model` object is reused afterwards)
# and store the compressed version under a separate file name.
quantize_params = dict(input='train.data', qnorm=True, retrain=True, cutoff=100000)
model.quantize(**quantize_params)
model.save_model('ft_quantized.model')

# Re-evaluate on the same test split to see what the compression costs.
quantized_pred = predict(model, test_df)
print(classification_report(test_df['y'], quantized_pred))

              precision    recall  f1-score   support

       happy       0.87      0.86      0.87      2482
   not_happy       0.87      0.87      0.87      2483

   micro avg       0.87      0.87      0.87      4965
   macro avg       0.87      0.87      0.87      4965
weighted avg       0.87      0.87      0.87      4965



In [15]:
# List directory entries whose listing line contains "model", to compare the
# on-disk size of the original and the quantized model files.
!ls -lah | grep model

-rw-r--r-- 1 root root 806M Nov 21 21:39 ft.model
-rw-r--r-- 1 root root 5.9M Nov 21 21:41 ft_quantized.model
