In [None]:
import numpy as np
import pandas as pd

In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount recursively and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pickle
import re
from tqdm import tqdm

import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string

In [None]:
# Column name and placeholder token. Neither is used in the cells visible here;
# presumably consumed further down the notebook (TODO confirm).
id_column, missing_token = 'id', 'UNK'

# The stopword corpus has to be downloaded before the stopword set can be built.
nltk.download('stopwords')
nltk.download('wordnet')

# English stopwords as a set for O(1) membership checks.
stoplist = {*stopwords.words('english')}

In [None]:
# Load the training CSV from the (relative) Kaggle input directory into `df`,
# the frame every later cell works on.
df = pd.read_csv('../input/us-patent-phrase-to-phrase-matching/train.csv')

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Preview of the first few rows.
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Summary statistics of the frame.
df.describe ()

In [None]:
# How many rows carry each distinct score value.
df.score.value_counts()

In [None]:
# Take counts and labels from ONE value_counts() call so every wedge is guaranteed
# to carry its own label. The previous version
#   * compared `score` against string literals such as '0.50', which never match
#     if the column was parsed as a number (every count would be 0),
#   * took the labels from unique(), whose order is unrelated to the counts, and
#   * passed a 3-element `explode` for 5 wedges, which plt.pie rejects.
score_counts = df['score'].value_counts()  # most frequent score first
count_list = score_counts.tolist()
# float() lets the label formatting work whether scores are numbers or numeric strings.
label_list = [f'{float(score):.2f}' for score in score_counts.index]
# Pull out the two largest wedges; `explode` needs exactly one entry per wedge.
explode = [0.1 if position < 2 else 0.0 for position in range(len(count_list))]

plt.figure(figsize = (10, 7))
plt.pie(count_list, labels = label_list, autopct = '%.2f %%', startangle = 90, explode = explode, textprops = {'fontsize': 12})
plt.title('Distribution of Scores', fontsize = 20)
plt.show()

In [None]:
# Frequency of each distinct value in `anchor`.
df['anchor'].value_counts()

In [None]:
# Frequency of each distinct value in `target`.
df['target'].value_counts()

In [None]:
# Frequency of each distinct value in `context`.
df['context'].value_counts()

In [None]:
# Frequency of each distinct value in `score` (same result as the earlier
# `df.score.value_counts()` cell).
df['score'].value_counts()

## Encode the response (`score`) as integer class labels

In [None]:
from sklearn import preprocessing

# Map every distinct score to an integer class id (0 .. n_classes-1).
# `fit()` only learns the classes and returns the encoder itself, so assigning its
# result overwrote the whole column with the LabelEncoder object. `fit_transform()`
# learns the classes AND returns the encoded labels. The original values stay
# recoverable through `le.classes_` / `le.inverse_transform`.
le = preprocessing.LabelEncoder()
df['score'] = le.fit_transform(df['score'])

In [None]:
# Re-check the score distribution after the encoding cell above has replaced the column.
df['score'].value_counts()

In [None]:
# Target vector: the `score` column of df.
y = df['score']

In [None]:
# Persist the target vector. The `with` block closes (and flushes) the file handle
# deterministically instead of leaving it to garbage collection.
with open('y.pkl', 'wb') as fh:
    pickle.dump(y, fh)

In [None]:
# Re-inspect dtypes now that `score` has been reassigned.
df.info()

## Factorize `context`: it has only a few levels (~100) relative to the ~30k observations

In [None]:
# Number of distinct values in `context`.
df['context'].nunique()

In [None]:
# Replace each context value with the integer code of its category.
df['context'] = pd.Categorical(df['context']).codes

In [None]:
# Confirm the new dtype of `context` after it was replaced by category codes.
df.info()

In [None]:
# Peek at the first two rows of the updated frame.
df.head(2)

## Analyze the text (TF-IDF features for `anchor` and `target`)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def getTextFeatures(T, Col, max_features=1000):
    """Fit a TF-IDF vectorizer on one text column and return matrix plus vocabulary.

    A fresh ``TfidfVectorizer`` with default settings is created on every call, so
    the vocabularies of different columns are independent of each other.

    Parameters
    ----------
    T : mapping-like table (e.g. a pandas DataFrame)
        Anything for which ``T[Col]`` yields an iterable of text documents.
    Col : hashable
        Key/column name of the text to vectorize.
    max_features : int, default 1000
        Accepted for interface compatibility only. It is NOT forwarded to the
        vectorizer, so the full vocabulary is always kept.
        NOTE(review): wiring it through would change the shape of the returned
        matrix for existing callers; decide deliberately before doing so.

    Returns
    -------
    X : matrix
        Whatever ``TfidfVectorizer.fit_transform`` returns for the column.
    feature_names : list of str
        Vocabulary terms, aligned with the columns of ``X``.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(T[Col])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        # The new API returns an ndarray; convert so callers keep getting a list.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    return X, feature_names

In [None]:
# TF-IDF matrix and vocabulary for `anchor` (vectorizer fitted on this column only).
X_anc, feat_anc = getTextFeatures(df, 'anchor')

In [None]:
# Same for `target`; getTextFeatures builds a new vectorizer per call, so this
# vocabulary is independent of the anchor one.
X_tar, feat_tar = getTextFeatures(df, 'target')

In [None]:
pickle.dump(X_anc, open('X_anc.pkl','wb'))

In [None]:
pickle.dump(X_tar, open('X_tar.pkl','wb'))

In [None]:
pickle.dump(feat_anc, open('feat_anc.pkl','wb'))

In [None]:
pickle.dump(feat_tar, open('feat_tar.pkl','wb'))

In [None]:
pickle.dump(df, open('df.pkl','wb'))

In [None]:
# Text dump of the anchor TF-IDF matrix.
print(X_anc)

In [None]:
# Text dump of the target TF-IDF matrix.
print(X_tar)

In [None]:
# Frame overview at this point of the notebook.
df.info()

In [None]:
from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

In [None]:
# Column-wise concatenation of the anchor and target TF-IDF matrices.
# scipy.sparse.hstack takes ONE sequence of blocks; its second positional
# parameter is the output format, so `hstack(X_anc, X_tar)` raised instead of
# stacking. Pass the matrices as a single tuple.
X_ = hstack((X_anc, X_tar))

In [None]:
X_anc2D = [list(item) for item in X_anc]

In [None]:
X_tar2D = [list(item) for item in X_tar]

In [None]:
# scipy.sparse.hstack expects a single sequence of sparse blocks (the second
# positional parameter is the output format), so the two-argument call raised.
# The per-row Python lists are not needed for stacking: concatenate the sparse
# matrices themselves.
X_ = hstack((X_anc, X_tar))

In [None]:
# Dense ndarray copy of the anchor matrix.
# NOTE(review): densifying allocates rows x vocabulary values; confirm it fits in memory.
X_ancD = X_anc.toarray()

In [None]:
# Dense ndarray copy of the target matrix (same memory caveat).
X_tarD = X_tar.toarray()

In [None]:
# scipy.sparse.hstack takes one sequence of sparse blocks; the second positional
# parameter is the output format, so the two-argument call raised. Wrap the dense
# arrays back into CSR and pass them as a single tuple so X_ stays sparse.
X_ = hstack((csr_matrix(X_ancD), csr_matrix(X_tarD)))

In [None]:
X_anc_den = csr_matrix.toarray(X_anc, out='2-D')

In [None]:
X_tar_den = csr_matrix.toarray(X_tar)

In [None]:
X_ = hstack(X_ancD, X_tarD)

In [None]:
X_2 = hstack(X_ancD, X_tarD)

In [None]:
# Show the dense anchor array.
print(X_ancD)

In [None]:
# Show the dense target array.
print(X_tarD)

In [None]:
# StandardScaler needs a 2-D input of shape (n_samples, n_features). A pandas
# Series has no `.reshape`, so go through the underlying ndarray to build the
# single-feature column.
context_ = df['context'].to_numpy().reshape(-1, 1)
# Standardize the context codes (zero mean, unit variance).
context = StandardScaler().fit_transform(context_)

In [None]:
# Final feature matrix: anchor and target TF-IDF columns side by side.
# The preceding `np.hstack(X_anc, X_tar)` attempt was dropped: np.hstack takes a
# single sequence argument and is meant for dense arrays, so that cell could only
# raise. The sparse-aware scipy.sparse.hstack is used instead and the result is
# converted to CSR format.
X_ = hstack((X_anc, X_tar)).tocsr()