In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import gensim
from gensim.models import Word2Vec
pd.set_option("display.max_column", None)
pd.set_option("display.max_colwidth", None)

import re

In [81]:
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split

In [82]:
survey_file = pd.read_csv("Spring PS Responses.csv")

In [83]:
NO_SUMMER_PLAN_THEMES = {
    "Work/Internship/Studying abroad/Research" : 1,
    "Want to be at home/with family" : 2, 
    "Graduating/Graduated" : 3, 
    "No housing/Not in Berkeley/No transportation/Expensive" : 4,
    "Not sure/Other" : 5
}

In [122]:
responses = survey_file.loc[:, ['SUMMER_NOENR_6_TEXT', 'UngradGradCd']].dropna()
responses = pd.concat([responses, pd.DataFrame(range(len(responses)))], axis = 1)
responses.to_excel('~$responses.xlsx')
responses.head(5)

Unnamed: 0,SUMMER_NOENR_6_TEXT,UngradGradCd,0
0,You guy's it's so crazy expensive are you kidding?,U,0
1,would rather fulfill breadths at another institution,U,1
2,Would rather do research or internship since those are not offer during the semester,U,2
3,Would prefer in person class instead of web classes,U,3
4,Working/Internship,G,4


In [123]:
training_data = pd.read_excel('responses_labeled.xlsx')
training_data = training_data.sample(frac=0.15)
labeled_data = training_data.to_excel('labeled_data.xlsx')

In [112]:
training_data.head(5)

Unnamed: 0.1,Unnamed: 0,SUMMER_NOENR_6_TEXT,UngradGradCd,Label
236,236,Graduating in May,G,3
182,182,I don't want to be away from my family in the summer,U,2
51,51,Too busy with internships,G,1
1,1,would rather fulfill breadths at another institution,U,5
246,246,Graduating,U,3


In [113]:
labeled_data = pd.read_excel('labeled_data.xlsx').loc[:,['Unnamed: 0', 'SUMMER_NOENR_6_TEXT', 'UngradGradCd', 'Label']]
unlabeled_data = responses.loc[responses[0].apply(lambda x: x not in list(labeled_data['Unnamed: 0']))]
labeled_data.head(5)

Unnamed: 0.1,Unnamed: 0,SUMMER_NOENR_6_TEXT,UngradGradCd,Label
0,236,Graduating in May,G,3
1,182,I don't want to be away from my family in the summer,U,2
2,51,Too busy with internships,G,1
3,1,would rather fulfill breadths at another institution,U,5
4,246,Graduating,U,3


In [114]:
labeled_data['SUMMER_NOENR_6_TEXT'] = labeled_data['SUMMER_NOENR_6_TEXT'].apply(lambda x: gensim.utils.simple_preprocess(x))
labeled_data = labeled_data.reset_index(drop=True)
labeled_data

unlabeled_data['SUMMER_NOENR_6_TEXT'] = unlabeled_data['SUMMER_NOENR_6_TEXT'].apply(lambda x: gensim.utils.simple_preprocess(x))
unlabeled_data = unlabeled_data.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['SUMMER_NOENR_6_TEXT'] = unlabeled_data['SUMMER_NOENR_6_TEXT'].apply(lambda x: gensim.utils.simple_preprocess(x))


In [115]:
X_train, X_test, y_train, y_test = train_test_split(labeled_data["SUMMER_NOENR_6_TEXT"], 
                                                    labeled_data["Label"], test_size=0.20)



In [116]:
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [117]:
words = w2v_model.wv.index_to_key

In [118]:
X_train_vect_avg = []
for v in [np.array([w2v_model.wv[i] for i in ls if i in words], dtype=object)
                         for ls in X_train]:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in [np.array([w2v_model.wv[i] for i in ls if i in words], dtype=object)
                         for ls in X_test]:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [119]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [120]:
y_pred_rf = rf_model.predict(X_test_vect_avg)

In [121]:
y_pred = y_pred_rf
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.444 / Recall: 0.444 / Accuracy: 0.444
