# Project 4 - Part 3

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
import gensim 
from gensim import corpora
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd
import re
from spacy.en import English
nlp = English()
from spacy.en import STOP_WORDS
import wikipedia
import psycopg2 as pg
from sqlalchemy import create_engine
import pandas as pd

#### DB setup SQLAlchemy connection

Postgres DB & Create tables

CREATE TABLE category1_tbl (categoryid INTEGER PRIMARY KEY, title TEXT, category TEXT);

CREATE TABLE page1_tbl (pageid INTEGER PRIMARY KEY, title TEXT, content TEXT, category TEXT);

CREATE TABLE page1_category1_tbl (pageid_categoryid INTEGER, pageid INTEGER REFERENCES page_tbl(pageid), categoryid INTEGER REFERENCES category_tbl(categoryid));

In [66]:
connection = create_engine("postgresql://postgres@postgres/postgres")

In [67]:
page_df=pd.read_sql_query('select * from page1_tbl',con=connection)

In [68]:
page_df.head()

Unnamed: 0,pageid,title,category,content
0,54972729.0,User:CustIntelMngt/sandbox/Customer Intelligen...,machine learning,\n= Customer Intelligence Management =\n\n\n==...
1,43385931.0,Data exploration,machine learning,Data exploration is an approach similar to ini...
2,49082762.0,List of datasets for machine learning research,machine learning,These datasets are used for machine learning r...
3,233488.0,Machine learning,machine learning,Machine learning is the subfield of computer s...
4,53587467.0,Outline of machine learning,machine learning,The following outline is provided as an overvi...


#### Clean Content

In [69]:
def cleaner(text):
    text = re.sub('&#39;','',text).lower()
    text = re.sub('<br />','',text)
    text = re.sub('<.*>.*</.*>','', text)
    text = re.sub('\\ufeff', '', text)
    text = re.sub('[\d]','',text)
    text = re.sub('[^a-z ]','',text)
    text = re.sub('\d+\.\d*','',text)
    text = ' '.join(i.lemma_ for i in nlp(text)
                    if i.orth_ not in STOP_WORDS)
    text = ' '.join(text.split())
    return text

In [70]:
page_df['clean_content'] = page_df['content'].apply(cleaner)

In [71]:
X = page_df['clean_content']
y = page_df['category']

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = (42))

In [73]:
tfidf_vectorizer_rf = TfidfVectorizer(min_df = 5, max_df = .95, ngram_range=(1,2), stop_words = "english")

In [74]:
document_term_matrix_rf = tfidf_vectorizer_rf.fit_transform(X_train)

In [75]:
document_term_matrix_rf.shape

(3907, 44837)

In [76]:
n_components = 400
SVD = TruncatedSVD(n_components)
component_names = ["component_"+str(i+1) for i in range(n_components)]

In [77]:
latent_semantic_analysis_rf = SVD.fit_transform(document_term_matrix_rf)

In [78]:
clf = RandomForestClassifier(n_jobs=-1)
clf.fit(latent_semantic_analysis_rf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [79]:
clf.score(latent_semantic_analysis_rf, y_train)

0.99590478628103407

In [80]:
clf.score(SVD.transform(tfidf_vectorizer_rf.transform(X_test)), y_test)

0.96085955487336916

### Random Forest with Grid Search & Pipeline

In [81]:
rf_pipe = Pipeline([
    ('v1', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=400)),
    ('clf', RandomForestClassifier())
])

In [82]:
rf_params = {
    'v1__min_df':[2],
    'v1__max_df':[.95],
    'v1__ngram_range':[(1,2)],
    'clf__n_estimators':[200],
    'clf__max_features':['auto']
}

In [83]:
rf_gs = GridSearchCV(rf_pipe, param_grid=rf_params, cv=5)

In [84]:
rf_gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('v1', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
    ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'v1__min_df': [2], 'v1__max_df': [0.95], 'v1__ngram_range': [(1, 2)], 'clf__n_estimators': [200], 'clf__max_features': ['auto']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [87]:
rf_gs.score(X_train, y_train)

0.99872024571282314

In [88]:
rf_gs.score(X_test, y_test)

0.9708365310821182

In [89]:
rf_gs.best_params_

{'clf__max_features': 'auto',
 'clf__n_estimators': 200,
 'v1__max_df': 0.95,
 'v1__min_df': 2,
 'v1__ngram_range': (1, 2)}

In [90]:
y_pred_rfgs = rf_gs.predict_proba(y_test)

In [112]:
X_new = 'artificial intelligence'

In [115]:
#page_df[page_df['content'].str.contains('artificial intelligence')]

In [116]:
page_df[page_df['pageid'] == 2934910.0]

Unnamed: 0,pageid,title,category,content,clean_content
35,2934910.0,Cognitive robotics,machine learning,Cognitive robotics is concerned with endowing ...,cognitive robotics concern endow robot intelli...


In [117]:
content_ml = []
page = wikipedia.WikipediaPage(pageid = 2934910.0)
content = page.content
title = page.title
content_ml.append([2934910.0, title, content])

In [118]:
X_new = pd.DataFrame(content_ml, columns=('pageid', 'title', 'content'))

In [119]:
X_new

Unnamed: 0,pageid,title,content
0,2934910.0,Cognitive robotics,Cognitive robotics is concerned with endowing ...


In [120]:
rf_gs.predict(X_new['content'])

array(['machine learning'], dtype=object)