### Introduction:
#### I am new to machine learning and Kaggle. This is one of my first formal competitions, and I would very much appreciate any help, guidance, feedback, or support in the comments so that I can learn more.

### Imports

In [None]:
import warnings
warnings.simplefilter('ignore')

import sys
sys.setrecursionlimit(1000000)

import pickle

import numpy as np
import pandas as pd
# Use the fully-qualified option keys. The short patterns 'max_columns' and
# 'max_rows' are matched as regexes and, in recent pandas releases, match more
# than one option (e.g. the styler ones too), which raises an OptionError.
pd.set_option('display.max_columns', None)  # never truncate columns in previews
pd.set_option('display.max_rows', 100)      # show up to 100 rows before truncating
from tqdm.notebook import tqdm

from collections import Counter
from math import sin, cos, sqrt, atan2, radians
from scipy import spatial

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import joblib

In [None]:
# Read the competition CSVs: the raw training table and the pre-built
# pairs table that the rest of this notebook works on.
INPUT_DIR = "../input/foursquare-location-matching"
train = pd.read_csv(INPUT_DIR + "/train.csv")
pairs = pd.read_csv(INPUT_DIR + "/pairs.csv")

### Selecting features

In [None]:
# Discard the address, city, state, zip, url and phone fields of both places
# in each pair, then replace every remaining missing value with the
# "__nan__" placeholder so later text processing never sees NaN.
unused_fields = ['address', 'city', 'state', 'zip', 'url', 'phone']
unused_columns = [
    '{}_{}'.format(field, side)
    for side in (1, 2)
    for field in unused_fields
]
pairs = pairs.drop(columns=unused_columns).fillna("__nan__")
pairs.head()

### Hash-vectorizing names and categories so they can be fed into an ML model

In [None]:
# Width of the hashed representation for each kind of text field.
NAME_HASH_DIM = 20
CATEGORY_HASH_DIM = 30

# One vectorizer for place names, one for category strings.
hv_names = HashingVectorizer(n_features=NAME_HASH_DIM)
hv_cats = HashingVectorizer(n_features=CATEGORY_HASH_DIM)

### Expanding the hashed arrays into one column per hash bucket and adding them to the DataFrame

In [None]:
# Hash each text column in one batched call instead of looping over rows.
# HashingVectorizer is stateless, so transforming the whole column yields the
# same per-row vectors as transforming one string at a time, without the
# row-by-row `iterrows` / `.at` writes (about 100 scalar assignments per row).
hashed_blocks = []
for vectorizer, column, prefix in [
    (hv_names, 'name_1', 'tk_name_1'),
    (hv_names, 'name_2', 'tk_name_2'),
    (hv_cats, 'categories_1', 'tk_categories_1'),
    (hv_cats, 'categories_2', 'tk_categories_2'),
]:
    # Dense (n_rows, n_features) array; one output column per hash bucket.
    dense = vectorizer.transform(pairs[column]).toarray()
    hashed_blocks.append(
        pd.DataFrame(
            dense,
            index=pairs.index,  # keep rows aligned with `pairs`
            columns=['{}_{}'.format(prefix, j) for j in range(dense.shape[1])],
        )
    )
hashed_features = pd.concat(hashed_blocks, axis=1)

# Drop any hashed columns left over from a previous run of this cell so that
# re-running it replaces them rather than duplicating them, then append the
# new columns in one concat to avoid a fragmented frame.
pairs = pd.concat(
    [pairs.drop(columns=hashed_features.columns, errors='ignore'), hashed_features],
    axis=1,
)
pairs.head()

### Encoding the country codes: to handle any noise, the encoder is fitted on every possible two-letter combination of the alphabet

In [None]:
# Cast the target column to integers.
pairs['match'] = pairs['match'].astype(int)

# Build every two-letter uppercase combination, plus the missing-value
# placeholder, as the label vocabulary for the country columns.
letters = list("QWERTYUIOPASDFGHJKLZXCVBNM")
print(letters)
country_codes = [first + second for first in letters for second in letters]
print(len(country_codes))
country_codes.append('__nan__')

# Fit once on the full vocabulary, then encode both country columns with it.
le = LabelEncoder()
le.fit(country_codes)
for country_column in ('country_1', 'country_2'):
    pairs[country_column] = le.transform(pairs[country_column].tolist())
pairs.head()

### Dropping non-feature columns for training

In [None]:
# Remove the id and raw-text columns of both places; what remains is the
# table handed to the train/test split.
non_feature_columns = [
    '{}_{}'.format(field, side)
    for side in (1, 2)
    for field in ('id', 'name', 'categories')
]
pair_training = pairs.drop(columns=non_feature_columns)
pair_training.head()

In [None]:
# Hold out 25% of the pairs for evaluation. The seed is fixed so the split,
# and therefore the reported score, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    pair_training.drop(['match'], axis=1),  # features: everything except the target
    pair_training['match'],                 # target
    test_size=0.25,
    random_state=42,
)

### Using RandomForestClassifier model

In [None]:
# Fit a 100-tree random forest on the training split and report its mean
# accuracy on the held-out split. The seed is fixed so that repeated runs
# build the same forest and print the same score.
model_rfs = RandomForestClassifier(
    n_estimators=100,
    verbose=1,
    n_jobs=1,
    random_state=42,
)
model_rfs.fit(X_train, y_train)
print(model_rfs.score(X_test, y_test))

In [None]:
# Persist the fitted forest to disk with joblib.
joblib.dump(value=model_rfs, filename="RandomForestClassifer.jbl")

In [None]:
# Mean accuracy of the fitted forest on the held-out split.
holdout_accuracy = model_rfs.score(X_test, y_test)
print(holdout_accuracy)