In [None]:
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process.kernels import RBF
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation
from nltk.stem import SnowballStemmer
from google.colab import drive

# Make the Google Drive files reachable from the Colab runtime.
drive.mount("/content/drive")

#### Part 1 ####

# Load the first dataset: the first CSV column becomes the row index and the
# literal string 'NA' is read as a missing value.
df = pd.read_csv(
  "/content/drive/My Drive/Lab5/lab4v2.csv",
  sep=",",
  index_col=[0],
  na_values=['NA'],
  low_memory=False,
)

def get_calss_name(name):
  """Map a vacancy title to a coarse job-class label.

  The title is searched for known keywords (plain, case-sensitive substring
  tests). Rules are tried in order and the first rule with a matching keyword
  decides the label.

  Args:
    name: Vacancy title as a string.

  Returns:
    The label of the first matching rule, or "Другое" when no keyword occurs
    in the title.
  """
  rules = (
    (("разработчик", "программист"), "Разработчик"),
    (("менеджер", "manager"), "Менеджер"),
    (("администратор",), "Администратор"),
    (("analyst", "аналитик"), "Аналитик"),
    (("художник",), "Художник"),
    (("аниматор",), "Аниматор"),
    # "дизайнер" is a substring of "геймдизайнер", so the more specific
    # keyword has to be tested first; otherwise it can never match.
    (("геймдизайнер",), "Геймдизайнер"),
    (("дизайнер",), "Дизайнер"),
    (("devops",), "devops"),
    (("тест", "tester"), "Тестировщик"),
  )
  for keywords, label in rules:
    if any(keyword in name for keyword in keywords):
      return label
  return "Другое"

# Derive the job class of every vacancy from its title.
df['Class'] = df['Vacancy Name'].apply(get_calss_name)

# Fill missing salary bounds with the mean of the same job class. Only the
# salary column itself is transformed: applying the lambda to every column of
# each group also calls mean() on the text columns, which is wasted work and
# fails on pandas versions that no longer drop such columns silently.
for salary_col in ("Salary Min", "Salary Max"):
  df[salary_col] = df.groupby("Class")[salary_col].transform(lambda x: x.fillna(x.mean()))

# Replace the class labels with integer codes; the fitted encoder is kept so
# the codes can be turned back into labels later.
encoder = LabelEncoder()
df['Class'] = encoder.fit_transform(df['Class'])

# Save the "<code> <label>" mapping. The context manager closes the file even
# if a write fails, and the explicit encoding keeps the Cyrillic labels
# independent of the platform's default locale.
with open("/content/drive/My Drive/Lab5/class.txt", "w", encoding="utf-8") as f:
  for num, c in enumerate(encoder.classes_):
    f.write(str(num)+" "+c+"\r\n")
#np.save('/content/drive/My Drive/Lab5/classes.npy', encoder.classes_)

# Drop the columns that are not used as model features.
df = df.drop(labels=['Vacancy Name', 'City', 'Company Name', 'Date', 'Description', 'Responsibility','Requirement'], axis=1)

def onehotencode(df, coln_name):
  """Fit a one-hot encoder on one column and replace it with indicator columns.

  Args:
    df: DataFrame containing the column to encode.
    coln_name: Label of the categorical column.

  Returns:
    A tuple ``(encoder, frame)``: the fitted OneHotEncoder and a new DataFrame
    in which ``coln_name`` is removed and the indicator columns are appended.
  """
  # The dense-output flag was renamed from `sparse` to `sparse_output` in
  # newer scikit-learn releases; fall back to the old spelling when the new
  # keyword is not accepted.
  try:
    encoder = OneHotEncoder(sparse_output=False)
  except TypeError:
    encoder = OneHotEncoder(sparse=False)
  encoded = encoder.fit_transform(pd.DataFrame(df[coln_name]))

  # Keep using get_feature_names() where it still exists so column names do
  # not change on those versions; use its replacement where it was removed.
  if hasattr(encoder, "get_feature_names"):
    feature_names = encoder.get_feature_names()
  else:
    feature_names = encoder.get_feature_names_out()

  # Reuse df's index so the column-wise concat pairs every indicator row with
  # its source row even when df does not have a default 0..n-1 index.
  indicators = pd.DataFrame(encoded, columns=feature_names, index=df.index)
  df = pd.concat([df.drop(columns=coln_name), indicators], axis=1)
  return encoder, df

def onehotencode_t(encoder, df, coln_name):
  """Replace one column with indicator columns using an already fitted encoder.

  Args:
    encoder: Fitted OneHotEncoder that produces dense output.
    df: DataFrame containing the column to encode.
    coln_name: Label of the categorical column.

  Returns:
    A new DataFrame in which ``coln_name`` is removed and the indicator
    columns produced by ``encoder`` are appended.
  """
  encoded = encoder.transform(pd.DataFrame(df[coln_name]))

  # Keep using get_feature_names() where it still exists so column names do
  # not change on those versions; use its replacement where it was removed.
  if hasattr(encoder, "get_feature_names"):
    feature_names = encoder.get_feature_names()
  else:
    feature_names = encoder.get_feature_names_out()

  # Reuse df's index so the column-wise concat pairs every indicator row with
  # its source row even when df does not have a default 0..n-1 index.
  indicators = pd.DataFrame(encoded, columns=feature_names, index=df.index)
  df = pd.concat([df.drop(columns=coln_name), indicators], axis=1)
  return df

# Swap each categorical column for its one-hot indicators. The fitted encoders
# stay in module scope so the same encoding can be applied to other data.
Expierence_encoder, df = onehotencode(df, 'Expierence')
Employment_encoder, df = onehotencode(df, 'Employment')
Schedule_encoder, df = onehotencode(df, 'Schedule')

# Text-processing resources: the Mystem analyzer, the NLTK tokenizer model and
# stop-word corpus, and a Russian Snowball stemmer.
mystem = Mystem()
for nltk_resource in ('punkt', 'stopwords'):
  nltk.download(nltk_resource)
russian_stopwords = stopwords.words("russian")
snowball = SnowballStemmer("russian")

def proctext(text):
  """Tokenize Russian text, discard stop words and punctuation, stem the rest.

  Tokens come from NLTK's ``word_tokenize`` with the Russian model. A token is
  discarded when it is listed in ``russian_stopwords`` or when it occurs inside
  ``string.punctuation`` (a substring test, because ``punctuation`` is a
  string). Remaining tokens are reduced to their Snowball stems.

  Args:
    text: The text to process.

  Returns:
    The stems of the kept tokens joined by single spaces.
  """
  stems = [
    snowball.stem(word)
    for word in word_tokenize(text, language="russian")
    if not (word in russian_stopwords or word in punctuation)
  ]
  return ' '.join(stems)

# Normalize the skills text (stop-word removal and stemming).
df['Key Skills'] = df['Key Skills'].apply(proctext)

# Turn the skills text into per-word count columns.
text_transformer = CountVectorizer()
text = text_transformer.fit_transform(df['Key Skills'])
# get_feature_names() was removed from newer scikit-learn releases; prefer its
# replacement and fall back to the old method on versions that lack it.
if hasattr(text_transformer, "get_feature_names_out"):
  vocabulary = text_transformer.get_feature_names_out()
else:
  vocabulary = text_transformer.get_feature_names()
# Reuse df's index so the column-wise concat keeps every count row next to the
# vacancy it was computed from.
words = pd.DataFrame(text.toarray(), columns=vocabulary, index=df.index)
df = pd.concat([df, words], axis=1).drop(['Key Skills'], axis=1)

# Separate features from the target and hold out 30% of the rows for testing.
data = df.drop(labels=['Class'], axis=1)
target = df['Class']
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.3)

# Fit every candidate classifier on the training split and print its score on
# the held-out split. Each estimator is only built when its turn comes, in the
# order listed here.
candidates = (
  ("MLPClassifier", MLPClassifier, {"alpha": 1, "max_iter": 1000}),
  ("KNeighborsClassifier", KNeighborsClassifier, {"n_neighbors": 35}),
  ("RandomForestClassifier", RandomForestClassifier, {"n_estimators": 160, "max_depth": 120}),
  ("AdaBoostClassifier", AdaBoostClassifier, {"n_estimators": 500}),
  ("MultinomialNB", MultinomialNB, {}),
)
for label, estimator_cls, params in candidates:
  m = estimator_cls(**params)
  m.fit(train_data, train_target)
  print(label + " score: " + str(m.score(test_data, test_target)))


#### Part 2 ####


# Load the second dataset, keep only rows with both salary bounds present and
# renumber the rows 0..n-1.
df_original = pd.read_csv("/content/drive/My Drive/Lab5/lab4-2v2.csv", delimiter=",", index_col=[0], na_values=['NA'], low_memory=False)
df_original = df_original.dropna(subset=['Salary Min', 'Salary Max'])
df_original.reset_index(drop=True, inplace=True)
df_original['Class'] = df_original['Vacancy Name'].apply(get_calss_name)

# Build the model input on an independent copy. Wrapping df_original in
# pd.DataFrame(...) does not reliably copy it, so whether encoding 'Class'
# below also changed df_original depended on the pandas version. With an
# explicit copy df_original always keeps its readable labels.
df_test = df_original.copy()
df_test['Class'] = encoder.transform(df_test['Class'])
df_test = onehotencode_t(Expierence_encoder, df_test, 'Expierence')
df_test = onehotencode_t(Employment_encoder, df_test, 'Employment')
df_test = onehotencode_t(Schedule_encoder, df_test, 'Schedule')
df_test = df_test.drop(labels=['Vacancy Name', 'City', 'Company Name', 'Date', 'Description', 'Responsibility','Requirement'], axis=1)

# Apply the already fitted text pipeline to the skills column.
df_test['Key Skills'] = df_test['Key Skills'].apply(proctext)
text = text_transformer.transform(df_test['Key Skills'])
# get_feature_names() was removed from newer scikit-learn releases; prefer its
# replacement and fall back to the old method on versions that lack it.
if hasattr(text_transformer, "get_feature_names_out"):
  vocabulary = text_transformer.get_feature_names_out()
else:
  vocabulary = text_transformer.get_feature_names()
words = pd.DataFrame(text.toarray(), columns=vocabulary, index=df_test.index)
df_test = pd.concat([df_test, words], axis=1).drop(['Key Skills'], axis=1)

df_test_data = df_test.drop(labels=['Class'], axis=1)
df_test_target = df_test['Class']

# Train on the complete first dataset and score on the second one.
model = RandomForestClassifier(n_estimators=160, max_depth=120)
model.fit(data, target)
print("Result score: "+str(model.score(df_test_data, df_test_target)))

# df_original["Class"] already holds label strings, so only the predicted
# codes have to be converted back to labels before export.
df_original["Class (predicted)"] = encoder.inverse_transform(model.predict(df_test_data))
df_original.to_csv("/content/drive/My Drive/Lab5/lab5.csv",  na_rep = 'NA', index = True, index_label = "", quotechar = '"', quoting = csv.QUOTE_NONNUMERIC, encoding = "utf-8-sig")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
MLPClassifier score: 0.5368066598073146
KNeighborsClassifier score: 0.7748573566551304
RandomForestClassifier score: 0.8274249368627817
AdaBoostClassifier score: 0.5246468992610607
MultinomialNB score: 0.3761107473575905
Result score: 0.7356643356643356
