In [17]:
from sklearn.metrics import confusion_matrix
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from joblib import dump, load
import json
from joblib import load


def df_merge(list_of_df, on):
    """
    Funkcja do łączenia dataframów z listy po kolumnie on
    """
    new_df = list_of_df[0]
    for i in list_of_df[1:]:
        new_df = new_df.merge(i, on = on)
    
    return new_df

def _label_to_country(df):
    """
    Jeżeli mamy w planach mergowanie to to funkcja po zmergowaniu
    Przyjmuje df z id: _label i zamieniam na _country
    """
    countries = []
    for i in df['_label']:
        countries.append(i.split('/')[5])
    df['_country']=countries
    df = df.drop("_label", axis=1)
    return df
    
def log_model(df,test_size,model_path):
    """
    Buduje model logistystyczny z penalty = l1, na podstawie df - dataframu
    Wpisujemy też wielkość próby testowej
    Funkcja zwraca dwa Seriesy z etykietami proby testowej i etykietami przewidywanymi przez model
    model_path - ścieżka gdzie chcemy zapisać model wraz z jego nazwą
    Zapisuje kolumny modelu do pliku json
    """
    X = df.drop("_country",axis=1)
    y = df['_country'].astype('category').cat.codes
    X = StandardScaler().fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_size, random_state=142)
    clf = LogisticRegression(C=0.01, penalty='l1', tol=0.01, solver='saga')#zmiana zeby szybciej liczyc i mniej zmniennych
    
    clf.fit(X_train, y_train)

    dump(clf, model_path + '.joblib')

    with open(model_path + 'columns.json', "w") as f:
        json.dump(list(df.drop('_country', axis=1 ).columns), f)


    y_pred = clf.predict(X_test)
    y_pred_train = clf.predict(X_train)
    
    return y_test, y_pred, y_train, y_pred_train

def confu_matrix_for_categories(df,y_test,y_pred,path,label_column='_country'):
    """
    Funkcja która, zrobi macierz gdy naszy predyktory są intami (nie krajami!)
    Przyjmuje df - wczytaną macierz wraz z 
    label_column - kolumną labelek
    """
    mark = pd.DataFrame(list(df['_country']),list(df['_country'].astype('category').cat.codes)).drop_duplicates()
    mark = mark.to_dict()[0]
    
    y_test =pd.DataFrame(y_test)
    y_test = y_test.replace({0: mark})
    
    y_pred =pd.DataFrame(y_pred)
    y_pred = y_pred.replace({0: mark})
    
    countries = list(np.unique(df['_country'].values))
    matrix = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=countries), columns=countries, index=countries)
    pd.DataFrame(matrix).to_csv(path)
    
    return matrix

def accuracy_and_f1(y_test, y_pred,y_train,y_pred_train,path):
    """
    Wyświetla accuracy i f1 w modelu
    """
    acc_test = np.round(sum(y_test==y_pred)/len(y_test),2)
    acc_train = np.round(sum(y_train==y_pred_train)/len(y_train),2)
    f1 = np.round(f1_score(y_test, y_pred, average='macro'),2)
    stats=pd.DataFrame([acc_test,acc_train,f1])
    stats.index=['acc_test','acc_train','f_score']
    stats.columns=['value']
    stats.to_csv(path)
    
def print_coef_for_predict(model, columns, predicted_value, how_many,path):
    """
    Funkcja, która printuje how_many współczynników które wpłynęły na przewidzianą wartość
    Przyjmuje model na którym przewidujemy, predicted_value i liczbę pierwszych predyktorów
    i columns - predyktory na których przewidywał
    """
    val = pd.DataFrame(pd.DataFrame(model.coef_).iloc[predicted_value]).values
    df = pd.DataFrame(val[0],columns).sort_values(by = 0, ascending = False).head(how_many).rename({0:'value'}, axis= 'columns')
    df.to_csv(path)

    
def print_coef_for_predict2(model, columns, predicted_value, how_many):
    """
    Funkcja, która printuje how_many współczynników które wpłynęły na przewidzianą wartość
    Przyjmuje model na którym przewidujemy, predicted_value i liczbę pierwszych predyktorów
    i columns - predyktory na których przewidywał
    """
    val = pd.DataFrame(pd.DataFrame(model.coef_).iloc[predicted_value]).values
    df = pd.DataFrame(val[0],columns).sort_values(by = 0, ascending = False).head(how_many).rename({0:'value'}, axis= 'columns')
    return df
    

In [3]:
df123=pd.read_pickle("./Datasets/123gram.pkl")
df123=_label_to_country(df123)
y_test, t_pred , y_train, y_pred_train= log_model(df123, 0.5, './modele/_model123')
confu_matrix_for_categories(df123, y_test, t_pred, './modele/_model123conf_matrix.csv')
accuracy_and_f1(y_test, t_pred,y_train,y_pred_train, './modele/_stats123.csv')

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
mod = load('./modele/_model123.joblib')
print_coef_for_predict(mod,df123.drop('_country',axis=1).columns, [9], 10,'./modele/_model123important_features.csv')

In [20]:
from joblib import load
df123=pd.read_pickle("./Datasets/123gram.pkl")
df123 = _label_to_country(df123)
mod = load('./modele/model123.joblib')


Unnamed: 0,value
natural science,0.129008
natural science of,0.12393
the natural science,0.111779
by the natural,0.097914
science of,0.095535
science of under,0.062919
science of no,0.060734
key laboratory,0.060089
key laboratory of,0.057527
research fund for,0.048531


In [50]:
print_coef_for_predict(mod,df123.drop('_country',axis=1).columns, [1], 50,'./modele/france.csv')

In [22]:
countries = [ 'China', 'France', 'Germany', 'Italy', 'Japan', 'Poland', 'Russia',
   'Spain', 'Turkey', 'UK',  'Vietnam', 'USA']

In [45]:
val=print_coef_for_predict2(mod,df123.drop('_country',axis=1).columns, [i], 50).value*10000
words=pd.DataFrame(print_coef_for_predict2(mod,df123.drop('_country',axis=1).columns, [i], 50).index)

In [46]:
for i in range(1,12):
    val[countries[i]]=(print_coef_for_predict2(mod,df123.drop('_country',axis=1).columns, [i], 50).value*10000)
    words[countries[i]=pd.DataFrame(print_coef_for_predict2(mod,df123.drop('_country',axis=1).columns, [i], 50).index)

SyntaxError: invalid syntax (<ipython-input-46-df64fbf8bf9a>, line 3)

In [44]:
val
val[countries(0)]

TypeError: 'list' object is not callable

In [27]:
i=1
pd.DataFrame(print_coef_for_predict2(mod,df123.drop('_country',axis=1).columns, [i], 50).index)

Unnamed: 0,0
0,consist in
1,indeed the
2,whatever the
3,indeed
4,thank to
5,we propose to
6,thank
7,whatever
8,to take into
9,on fig


In [3]:
pd.read_csv('./modele/stats123.csv')

Unnamed: 0.1,Unnamed: 0,value
0,acc_test,0.87
1,acc_train,0.99
2,f_score,0.86


# Budowa modelu

In [169]:
#from sklearn.linear_model import LogisticRegressionCV

In [170]:
# df123 = pd.read_pickle("./Datasets/123gram.pkl")
# df = _label_to_country(df123)
# X = df.drop("_country",axis=1)
# y = df['_country'].astype('category').cat.codes
# X = StandardScaler().fit_transform(X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
# from sklearn.model_selection import cross_val_score 
# from sklearn.linear_model import LogisticRegression
# clf = LogisticRegression(C = 0.1, penalty='l1', tol=0.0001, solver='saga')
# cross_val_score(clf, X, y, cv=5, scoring='accuracy')

