In [1]:
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# we picked a very simple dataset with a single - but very interesting - feature: these are
# movie reviews, and our goal is to tell whether a review is negative or positive. this is an
# NLP task, so we are going to need TfidfVectorizer here!

Downloading imdb-dataset-of-50k-movie-reviews.zip to /home/poli/Documents/data_science/ml/06
100%|██████████████████████████████████████| 25.7M/25.7M [00:05<00:00, 6.30MB/s]
100%|██████████████████████████████████████| 25.7M/25.7M [00:05<00:00, 4.69MB/s]


In [2]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [3]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# logistic regression is hardly our favourite classifier, but it gave very good results
# here, so there is no point in building an overly complex model
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [4]:
import warnings
warnings.simplefilter('ignore')

In [5]:
def metrics_counter(model, y_test, y_pred):
    """
    function takes true values and predicted scores, finds the threshold with the
    best F1 score and returns the main DS metrics at that threshold
    model: str - label of the experiment, returned unchanged as the first element
    y_test: pd.Series - true binary labels
    y_pred: np.array - predicted scores for the positive class
    returns: tuple (model, threshold, fscore, precision, recall, roc_auc),
             every numeric value rounded to 3 decimals
    """
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

    # The curve ends with an extra (precision=1, recall=0) point that has no
    # matching threshold; drop it so every candidate index is valid for `thresholds`.
    precision, recall = precision[:-1], recall[:-1]

    # When precision and recall are both 0 (the top-scored sample is a true
    # negative) the plain formula is 0/0 = NaN, and np.argmax would pick that NaN
    # as the "best" point. Define F1 as 0 there instead.
    denom = precision + recall
    fscore = np.divide(2 * precision * recall, denom,
                       out=np.zeros_like(denom), where=denom > 0)
    ix = np.argmax(fscore)

    threshold = round(thresholds[ix], 3)
    fscore = round(fscore[ix], 3)
    precision = round(precision[ix], 3)
    recall = round(recall[ix], 3)
    # ROC AUC is threshold-independent and is computed on the raw scores.
    roc_auc = round(roc_auc_score(y_test, y_pred), 3)

    return model, threshold, fscore, precision, recall, roc_auc

In [6]:
def create_unlabeled_sets(df, pos_frac, random_state=None):
    """
    function takes a dataframe and the fraction of positives to keep labeled,
    returns a copy with an extra integer column 'is_labeled'
    df: pd.Dataframe - must contain a 'sentiment' column where 1 marks positives
    pos_frac: float - share of the positive rows that stay labeled (is_labeled = 1)
    random_state: int, np.random.RandomState/Generator or None - forwarded to
                  DataFrame.sample; pass a fixed value for a reproducible selection
    returns: pd.Dataframe - the input is left untouched; all rows that were not
             selected (the remaining positives and every negative) get is_labeled = 0
    """
    df_u = df.copy()
    pos_mask = (df_u['sentiment'] == 1)
    pos_ind = df_u[pos_mask].sample(frac=pos_frac, random_state=random_state).index

    # Build the flag in one vectorised step so the column is integer from the
    # start instead of passing through a NaN-filled float column.
    df_u['is_labeled'] = df_u.index.isin(pos_ind).astype(int)
    return df_u

In [7]:
def get_train_test_samples(df, random_state=None):
    """
    function takes a dataframe with an 'is_labeled' column, returns train and test samples
    df: pd.Dataframe
    random_state: int, np.random.RandomState/Generator or None - forwarded to
                  DataFrame.sample; pass a fixed value for a reproducible split
    returns: (train_samples, test_samples)
             train_samples - all labeled rows plus an equally sized random draw of
                             unlabeled rows, shuffled together
             test_samples  - the unlabeled rows that were not drawn into train
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    pos_sample = shuffled[shuffled['is_labeled'] == 1]
    unlabeled = shuffled[shuffled['is_labeled'] == 0]
    n_pos = pos_sample.shape[0]

    # The frame is already shuffled, so the first n_pos unlabeled rows are a
    # random draw; everything after them is kept for evaluation.
    neg_sample = unlabeled.iloc[:n_pos]
    test_samples = unlabeled.iloc[n_pos:]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1, random_state=random_state)

    return train_samples, test_samples

In [8]:
# Read the extracted CSV into a DataFrame and preview the first rows.
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
# Count the rows per sentiment value to check the class balance.
df.value_counts(['sentiment'])

sentiment
negative     25000
positive     25000
Name: count, dtype: int64

In [10]:
# that is all the feature engineering this dataset needs: what matters here is the analysis
# of the review text, so we only convert the target to an integer type.
# the explicit astype(int) keeps the column integer even when replace() hands back an
# object column (newer pandas deprecates the implicit downcast), and the cell stays
# safe to re-run.
df['sentiment'] = df['sentiment'].replace(['negative', 'positive'], [0, 1]).astype(int)

In [11]:
# Hold out part of the raw review texts for evaluation; random_state pins the shuffle
# so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], 
                                                    random_state=42)

In [12]:
# Baseline model: TF-IDF features (English stop words removed) feeding a logistic regression.
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), 
                     ('clf', LogisticRegression())])

In [13]:
# Fit the baseline on the training reviews with their true sentiment labels.
pipeline.fit(x_train, y_train)

In [14]:
# Keep column 1 of predict_proba as the score passed to the metrics; preview the first ten.
preds = pipeline.predict_proba(x_test)[:, 1]
preds[:10]

array([0.30522345, 0.79247694, 0.04648578, 0.93588426, 0.18650531,
       0.9009605 , 0.97690321, 0.41042184, 0.31562173, 0.09871611])

In [15]:
# Results table: one row per experiment, filled from the tuple metrics_counter returns.
metrics = pd.DataFrame(columns=['model', 'threshold', 'f1-score', 
                                'precision', 'recall', 'roc auc'])
metrics.loc[len(metrics.index)] = metrics_counter('Logistic Regression', y_test, preds)
metrics

Unnamed: 0,model,threshold,f1-score,precision,recall,roc auc
0,Logistic Regression,0.499,0.899,0.887,0.912,0.961


In [16]:
# give the dataset new class labels - positive (1) and unlabeled (0). for convenience they
# are stored directly as the integers 1 and 0
new_model = create_unlabeled_sets(df, pos_frac=.25)
new_model.head()

Unnamed: 0,review,sentiment,is_labeled
0,One of the other reviewers has mentioned that ...,1,0
1,A wonderful little production. <br /><br />The...,1,0
2,I thought this was a wonderful way to spend ti...,1,0
3,Basically there's a family where a little boy ...,0,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,0


In [17]:
# Sanity check: row and column count after adding the is_labeled column.
new_model.shape

(50000, 3)

In [18]:
# How many rows ended up labeled (1) versus unlabeled (0).
new_model['is_labeled'].value_counts()

is_labeled
0    43750
1     6250
Name: count, dtype: int64

In [19]:
# Train set: all labeled positives plus an equally sized draw of unlabeled rows;
# test set: the remaining unlabeled rows.
train, test = get_train_test_samples(new_model)

In [20]:
# Fresh, unfitted pipeline with the same configuration as the baseline.
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), 
                     ('clf', LogisticRegression())])

In [21]:
# The target here is is_labeled, not the true sentiment.
pipeline.fit(train['review'], train['is_labeled'])

In [22]:
# Score the held-out unlabeled reviews; column 1 of predict_proba is kept as the score.
preds = pipeline.predict_proba(test['review'])[:, 1]
preds[:10]

array([0.21218308, 0.11193907, 0.48465044, 0.52254905, 0.5612756 ,
       0.28149741, 0.59258879, 0.04554816, 0.46252658, 0.20355604])

In [23]:
# Evaluate against the true sentiment, which was not used as the training target.
metrics_rns = metrics_counter('RNS', test['sentiment'], preds)

In [24]:
# Append the RNS row to the results table and display it next to the baseline.
metrics.loc[len(metrics.index)] = metrics_rns
metrics

Unnamed: 0,model,threshold,f1-score,precision,recall,roc auc
0,Logistic Regression,0.499,0.899,0.887,0.912,0.961
1,RNS,0.441,0.824,0.794,0.856,0.922


In [25]:
# excellent results! RNS did slightly worse than the model trained on fully labeled data, but
# the metrics are still very high.

In [26]:
# Table that will hold one row of metrics per labeled-positive fraction.
metrics_fracs = pd.DataFrame(columns=['frac', 'threshold', 'f1-score', 
                                      'precision', 'recall', 'roc auc'])

# Nine evenly spaced fractions from 0.1 to 0.9.
fracs = np.linspace(.1, .9, 9)

In [27]:
# Repeat the RNS experiment for every labeled-positive fraction and collect one
# row of metrics per fraction.
frac_rows = []
for frac in fracs:
    new_model = create_unlabeled_sets(df, pos_frac=frac)

    train, test = get_train_test_samples(new_model)

    pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                         ('clf', LogisticRegression())])

    pipeline.fit(train['review'], train['is_labeled'])

    preds = pipeline.predict_proba(test['review'])[:, 1]

    # Deliberately NOT named `metrics`: that name is the results DataFrame built
    # earlier in the notebook, and rebinding it to a tuple breaks re-running
    # the cells that append to it.
    frac_metrics = metrics_counter(frac, test['sentiment'], preds)

    frac_rows.append(frac_metrics)

# Rebuild the table from the collected rows (instead of appending to it) so that
# re-running this cell does not duplicate rows.
metrics_fracs = pd.DataFrame(frac_rows, columns=metrics_fracs.columns)
metrics_fracs

Unnamed: 0,frac,threshold,f1-score,precision,recall,roc auc
0,0.1,0.449,0.817,0.783,0.854,0.902
1,0.2,0.441,0.829,0.798,0.862,0.922
2,0.3,0.438,0.828,0.789,0.872,0.931
3,0.4,0.454,0.83,0.802,0.859,0.941
4,0.5,0.471,0.822,0.797,0.847,0.944
5,0.6,0.48,0.81,0.77,0.854,0.946
6,0.7,0.503,0.794,0.748,0.846,0.951
7,0.8,0.597,0.774,0.788,0.761,0.955
8,0.9,0.683,0.698,0.713,0.684,0.956


In [28]:
# almost every metric peaks at its own share of labeled positives: in the table above f1 score
# and precision are highest at 0.4, recall at 0.3 and roc auc at 0.9 (the sampling is not
# seeded, so the exact fractions can shift from run to run)