In [292]:
import string
from sklearn import preprocessing
import pandas as pd
import plotly.express as px
from tqdm import tqdm
tqdm.pandas()


In [293]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
english_stopwords = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rafaga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [294]:
from sklearn.pipeline import Pipeline
# pipeline позволяет объединить в один блок трансформер и модель, что упрощает написание кода и улучшает его читаемость
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer преобразует тексты в числовые векторы, отражающие важность использования каждого слова из некоторого набора слов (количество слов набора определяет размерность вектора) в каждом тексте
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
# линейный классификатор и классификатор методом ближайших соседей
from sklearn import metrics
# набор метрик для оценки качества модели

In [295]:
DF = pd.read_excel("input/Content Automation (1).xlsx", header=1)

In [296]:
DF.head()

Unnamed: 0,Subject,Text Modification,TEXT,Style,Length\n (Words),Keywords,Clarity of the point,Completeness,Style as per requested,Length,Keywords usage,Rank,Avg,Word Count
0,1.0,1.0,Friction is the force that resists motion when...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,110.0
1,1.0,2.0,Friction is the force resisting the relative m...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.9,1.0,0.9,1.0,0.5,2.0,0.86,113.0
2,1.0,3.0,Friction is the force between two surfaces tha...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.8,0.85,0.9,0.9,0.8,3.0,0.85,102.0
3,1.0,4.0,Friction is the force that resists motion when...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.95,0.8,0.7,0.9,0.8,4.0,0.83,106.0
4,1.0,5.0,Friction is the unseen force that resists moti...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.8,0.8,0.6,0.8,1.0,5.0,0.8,109.0


Посмотрим на разменрность данных

In [297]:
DF.shape

(605, 14)

Построим распределение стилей текста

In [298]:
DF["Style"].value_counts()

Academic        115
Informative      95
Analytical       60
Casual           60
Descriptive      50
For Kids         45
Helpful          40
Motivational     30
Explainer         5
Detailed          5
Name: Style, dtype: int64

In [299]:
fig = px.bar(DF, x='Style',
             hover_data=['Style as per requested', 'Completeness', 'Clarity of the point', 'Length', 'Keywords usage', 'Word Count'], color='Word Count',
             labels={'pop':'population of Canada'}, height=400)
fig.show()

В таблице присутсвтуют пустые значения в кол-ве слов


In [300]:
DF.dropna(inplace=True)
test_df = DF[DF.isnull().any(1)]
test_df.head()

Unnamed: 0,Subject,Text Modification,TEXT,Style,Length\n (Words),Keywords,Clarity of the point,Completeness,Style as per requested,Length,Keywords usage,Rank,Avg,Word Count


In [301]:
def word_count(string):
    return len(string.strip().split(" "))

DF['Word Count'] = DF['TEXT'].apply(lambda x : word_count(x))

In [302]:
# Данные стили убрал, по ним 2-5 примеров
DF = DF[(DF['Style'] != 'Detailed') & (DF['Style'] != 'Explainer')]

In [303]:
DF['Style'].value_counts()

Academic        110
Informative      85
Casual           55
Analytical       50
Descriptive      50
For Kids         40
Helpful          40
Motivational     30
Name: Style, dtype: int64

In [304]:
# Использую кодировку для значений стиля
le = preprocessing.LabelEncoder()
DF['StyleLabel'] = le.fit_transform(DF['Style'])
DF.groupby(['Style'])['StyleLabel'].value_counts()


Style         StyleLabel
Academic      0             110
Analytical    1              50
Casual        2              55
Descriptive   3              50
For Kids      4              40
Helpful       5              40
Informative   6              85
Motivational  7              30
Name: StyleLabel, dtype: int64

In [305]:
DF

Unnamed: 0,Subject,Text Modification,TEXT,Style,Length\n (Words),Keywords,Clarity of the point,Completeness,Style as per requested,Length,Keywords usage,Rank,Avg,Word Count,StyleLabel
0,1.0,1.0,Friction is the force that resists motion when...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,1.00,1.00,1.00,1.00,1.00,1.0,1.00,110,0
1,1.0,2.0,Friction is the force resisting the relative m...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.90,1.00,0.90,1.00,0.50,2.0,0.86,113,0
2,1.0,3.0,Friction is the force between two surfaces tha...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.80,0.85,0.90,0.90,0.80,3.0,0.85,102,0
3,1.0,4.0,Friction is the force that resists motion when...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.95,0.80,0.70,0.90,0.80,4.0,0.83,106,0
4,1.0,5.0,Friction is the unseen force that resists moti...,Academic,110.0,- Friction\n- Static Friction\n- Sliding Frict...,0.80,0.80,0.60,0.80,1.00,5.0,0.80,109,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,100.0,1.0,Automation describes a wide range of technolog...,Academic,65.0,- Automation\n- Technology\n- Machine,1.00,1.00,1.00,1.00,1.00,1.0,1.00,64,0
601,100.0,2.0,Automation is the process of automating tasks....,Academic,65.0,- Automation\n- Technology\n- Machine,0.80,0.80,0.85,1.00,0.35,3.0,0.76,67,0
602,100.0,3.0,"In the future, automation will be the norm. Ne...",Academic,65.0,- Automation\n- Technology\n- Machine,0.70,0.75,0.65,1.00,0.35,4.0,0.69,71,0
603,100.0,4.0,If you're looking for a way to totally disrupt...,Academic,65.0,- Automation\n- Technology\n- Machine,0.60,0.55,0.50,1.00,0.35,5.0,0.60,68,0


In [306]:
# Удаление знаков пунктуации из текста
def remove_punct(text):
    table = {33: ' ', 34: ' ', 35: ' ', 36: ' ', 37: ' ', 38: ' ', 39: ' ', 40: ' ', 41: ' ', 42: ' ', 43: ' ', 44: ' ', 45: ' ', 46: ' ', 47: ' ', 58: ' ', 59: ' ', 60: ' ', 61: ' ', 62: ' ', 63: ' ', 64: ' ', 91: ' ', 92: ' ', 93: ' ', 94: ' ', 95: ' ', 96: ' ', 123: ' ', 124: ' ', 125: ' ', 126: ' '}
    return text.translate(table)

In [307]:
# Очистка текста от лишних пунктуаций и символов
DF['TEXT_clean'] = DF['TEXT'].map(lambda x: x.lower())
DF['TEXT_clean'] = DF['TEXT_clean'].map(lambda x: remove_punct(x))
DF['TEXT_clean'] = DF['TEXT_clean'].map(lambda x: x.split(' '))
DF['TEXT_clean'] = DF['TEXT_clean'].map(lambda x: [token for token in x if token not in english_stopwords\
                                                                  and token != " " \
                                                                  and token.strip() not in punctuation])
DF['TEXT_clean'] = DF['TEXT_clean'].map(lambda x: ' '.join(x))

In [308]:
# Функция для разбиения датасета на тренировочную и тестовую часть
def dataset_train_test(df, size_train = 1):
  train = pd.DataFrame()
  test = pd.DataFrame()
  for style in df['Style'].unique():
    df_2 =  df[df['Style'] == style]
    value_train = round((df_2.shape[0]/5) * size_train)
    train = train.append(df_2.iloc[0 : value_train * 5])  
    test = test.append(df_2.iloc[( value_train * 5 + 1) : ]) 
  return train, test

In [309]:
Train , Test = dataset_train_test(DF)


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

In [310]:
Train['Style'].value_counts()

Academic        110
Informative      85
Casual           55
Analytical       50
Descriptive      50
For Kids         40
Helpful          40
Motivational     30
Name: Style, dtype: int64

In [311]:
Test['Style'].value_counts()

Series([], Name: Style, dtype: int64)

In [312]:
# Предварительная проверка обучения на стиле текста
X_train = Train['TEXT_clean']
y_train = Train['StyleLabel']
X_test = Train['TEXT_clean']
y_test = Train['StyleLabel']



sgd_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    ('sgd_clf', SGDClassifier(penalty='elasticnet', class_weight='balanced', random_state=42))])
knb_ppl_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knb_clf', KNeighborsClassifier(n_neighbors=10))])
sgd_ppl_clf.fit(X_train, y_train)
knb_ppl_clf.fit(X_train, y_train)




In [313]:
predicted_sgd = sgd_ppl_clf.predict(X_test)
print(metrics.classification_report(predicted_sgd, y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       110
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        55
           3       1.00      1.00      1.00        50
           4       1.00      1.00      1.00        40
           5       1.00      1.00      1.00        40
           6       1.00      1.00      1.00        85
           7       1.00      1.00      1.00        30

    accuracy                           1.00       460
   macro avg       1.00      1.00      1.00       460
weighted avg       1.00      1.00      1.00       460



In [314]:
test_text = '''French is not only an important language for international relations, but also for business and trade. As the sixth largest economy in the world, France is a major player in both the European Union and the global market. Whether you're looking to study abroad in France, do business with a French company, or simply order a baguette at your local bakery, learning French will give you a leg up. Hence it is important to learn French. It can be very helpful'''
test_text = test_text.lower()
test_text = remove_punct(test_text)
test_text = test_text.split(' ')
test_text = [token for token in test_text if token not in english_stopwords\
                                                                  and token != " " \
                                                                  and token.strip() not in punctuation]
test_text = ' '.join(test_text)
test_dict = {'test_text': [test_text]}
df_test = pd.DataFrame.from_dict(test_dict)
df_test.head()
predicted_sgd = knb_ppl_clf.predict(df_test['test_text'])
# predicted_sgd = knb_ppl_clf.predict(X_test)
print(predicted_sgd)
print(metrics.classification_report(predicted_sgd, [0]))

[0]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1

