In [1]:
from bs4 import BeautifulSoup
from requests import get as req_get
import os.path
import json

# Adquisición de datos

In [2]:
""" Funciones auxiliares """
def normalize(s):
    """Turn a display name into a lowercase, accent-free, hyphenated slug.

    The text is lowercased, the Spanish accented vowels and "ñ" are mapped
    to their plain ASCII counterparts, surrounding whitespace is dropped and
    every run of whitespace between words becomes a single hyphen.

    Args:
        s: Text to normalise (e.g. a professor or department name).

    Returns:
        The slug as a ``str``. Empty or whitespace-only input gives ``""``.
    """
    # Lowercasing first means only the lowercase accented forms need mapping.
    accent_map = str.maketrans("áéíóúñ", "aeioun")
    plain = s.lower().translate(accent_map)
    # split() with no arguments strips the ends and treats any run of
    # whitespace as one separator, so "a   b" can never become "a--b".
    return "-".join(plain.split())


def url_maker(departamento, docente):
    """Build a professor's profile URL on losestudiantes.co.

    Args:
        departamento: Department slug, used as the path segment right after
            the university prefix.
        docente: Professor slug, used as the last path segment.

    Returns:
        The URL string ``<prefix><departamento>/profesores/<docente>``.
    """
    site_prefix = "https://losestudiantes.co/universidad-nacional/"
    return "{}{}/profesores/{}".format(site_prefix, departamento, docente)

In [3]:
# Collect the teaching staff of the arts faculty, grouped by school.
# The index page exposes one <option> per school; the same page is then
# requested once per option with ?escuela=<value>, and the text of every
# <span class="name"> on that page is stored as a normalised name.
url_artes = 'http://www.facartes.unal.edu.co/fa/docentes/index.php'
# A timeout keeps a stalled connection from blocking the cell forever.
html_artes = req_get(url_artes, timeout=30).text

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree can differ per machine.
arbol = BeautifulSoup(html_artes, "html.parser")
escuelas = arbol.find_all("option")
docentes = {}

for escuela in escuelas:
    new_url = url_artes + "?escuela=" + escuela['value']
    html_artes = req_get(new_url, timeout=30).text
    arbol = BeautifulSoup(html_artes, "html.parser")
    docente_dep = [
        normalize(i.text) for i in arbol.find_all("span", {"class": "name"})
    ]
    # The dictionary key is the normalised label of the option the returned
    # page marks as selected, not the label of the option we iterated over.
    docentes[normalize(arbol.find("option", selected=True).text)] = docente_dep

# Keys are later used as the department segment of losestudiantes.co URLs;
# presumably that site lists this department under the longer slug.
docentes["artes-plasticas-y-visuales"] = docentes.pop("artes-plasticas")

In [4]:
# Collect the teaching staff of the administration and public accounting
# department (escuela=1 on the FCE staff page).

url_administracion_contaduria = "http://fce.unal.edu.co/docentesfce/index.php?escuela=1"
# A timeout keeps a stalled connection from blocking the cell forever.
html_administracion_contaduria = req_get(url_administracion_contaduria, timeout=30).text
# Explicit parser: without it BeautifulSoup uses whichever one is installed
# (and warns), so results could differ between machines.
arbol = BeautifulSoup(html_administracion_contaduria, "html.parser")
# Names are taken from the <h4> tags whose inline style is exactly
# "font-size: 16px"; each one is stored in normalised (slug) form.
docente_dep = [
    normalize(profesor.text)
    for profesor in arbol.find_all("h4", {"style": "font-size: 16px"})
]
docentes["administracion-y-contaduria-publica"] = docente_dep

In [5]:
# Collect the teaching staff of the economics department (escuela=2 on the
# FCE staff page).

url_economia = "http://fce.unal.edu.co/docentesfce/index.php?escuela=2"
# A timeout keeps a stalled connection from blocking the cell forever.
html_economia = req_get(url_economia, timeout=30).text
# Explicit parser: without it BeautifulSoup uses whichever one is installed
# (and warns), so results could differ between machines.
arbol = BeautifulSoup(html_economia, "html.parser")
# Names are taken from the <h4> tags whose inline style is exactly
# "font-size: 16px"; each one is stored in normalised (slug) form.
docente_dep = [
    normalize(profesor.text)
    for profesor in arbol.find_all("h4", {"style": "font-size: 16px"})
]
docentes["economia"] = docente_dep


In [6]:
# Build the list of Uniandes professor pages from a locally saved listing:
# every <a class="jsx-633353764"> in ./data/url_andes.txt contributes its
# href, prefixed with the losestudiantes.co host.
urls = []
with open("./data/url_andes.txt", "r") as url_andes:
    # Explicit parser: without it BeautifulSoup uses whichever one is
    # installed (and warns), so results could differ between machines.
    arbol = BeautifulSoup(url_andes.read(), "html.parser")
# The file has been read in full, so the tree can be walked after closing it.
for a in arbol.find_all("a", {"class": "jsx-633353764"}):
    urls.append("https://losestudiantes.co/" + a['href'])
print(len(urls))

3302


In [7]:
# Persist the scraped rosters. The context manager guarantees the file is
# flushed and closed instead of relying on garbage collection.
with open("./data/profesors.json", 'w') as profesores_file:
    json.dump(docentes, profesores_file)

# Add one profile URL per (department, professor) pair to the existing list.
# NOTE: this appends to `urls`, so re-running the cell without re-running the
# cell that creates `urls` adds every entry a second time.
for k, v in docentes.items():
    for profesor in v:
        urls.append(url_maker(k, profesor))

# One URL per line.
with open('urls.txt', 'w') as f:
    for item in urls:
        f.write("%s\n" % item)

In [None]:
# Build `train_data`: one (review text, rating, source url) record per review.
# If reviews.json already exists it is used as a cache and nothing is scraped.
train_data = []
if os.path.isfile('reviews.json'):
    # Cached run. Records come back from JSON as lists rather than tuples.
    with open("reviews.json") as reviews_file:
        train_data = json.load(reviews_file)
else:
    for url in urls:
        # A timeout keeps one stalled page from blocking the whole crawl.
        html_review = req_get(url, timeout=30).text
        # Explicit parser: without it BeautifulSoup uses whichever one is
        # installed (and warns), so results could differ between machines.
        arbol = BeautifulSoup(html_review, "html.parser")
        # The class string (including its trailing space) is kept exactly as
        # it was; it is presumably matched against the full class attribute.
        containers = arbol.find_all("li", {"class": "jsx-571610088 post "})
        for container in containers:
            review = container.find("div", {"class": "jsx-571610088 lineBreak"})
            nota = container.find("span", {"class": "jsx-571610088 numeroStats"})
            # Skip posts that lack either the text or the rating.
            if nota is None or review is None:
                continue
            train_data.append((review.text, str(nota.text), url))

In [None]:
# Human-readable dump: "<review text> --<rating>" per record.
# UTF-8 is requested explicitly (matching reviews.json below) so accented
# text is written the same way regardless of the platform's default encoding.
with open('reviews.txt', 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write("{} --{}\n".format(item[0], item[1]))
# Cache consumed by the scraping cell on later runs; the context manager
# makes sure the file is flushed and closed.
with open("reviews.json", 'w', encoding='utf-8') as reviews_file:
    json.dump(train_data, reviews_file)

# Entrenamiento de modelo

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import pandas as pd

In [None]:
# Tabulate the records: column 0 is the review text, column 1 the rating
# (kept as a string) and column 2 the page the review came from.
df = pd.DataFrame(train_data)
# Per-column non-null counts over the rows whose rating is exactly "5".
es_cinco = df[1] == "5"
print(df.loc[es_cinco].count())
print(df)

In [None]:
# Bag-of-words features over the review text (column 0). With binary=True
# each term is recorded as present/absent rather than counted.
cv = CountVectorizer(binary=True)
# fit_transform learns the vocabulary and encodes the reviews in one pass.
X = cv.fit_transform(df[0])

# 75 % train / 25 % test, with the rating strings (column 1) as labels.
# The split is seeded (same seed as the final LogisticRegression) so that
# reported accuracies are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, df[1], train_size=0.75, random_state=7
)
print(y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Try several regularisation strengths (C is the inverse strength) and report
# the held-out accuracy of each one that reaches at least 0.4.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(solver='newton-cg', C=c, max_iter=1000)
    lr.fit(x_train, y_train)
    accuracy = accuracy_score(y_test, lr.predict(x_test))
    if accuracy >= 0.4:
        # This print must sit inside the `if`; at the same level as the
        # condition the cell does not even parse.
        print("Accuracy for C={}: {}".format(c, accuracy))

# Final model used for predictions. Its settings (C=1, lbfgs) are fixed here
# and are not derived from the sweep above.
logit = LogisticRegression(C=1, solver='lbfgs', n_jobs=-1, random_state=7, max_iter=1000)
log_reg = logit.fit(x_train, y_train)

In [None]:
# Disabled Keras experiment: the whole snippet below sits inside a string
# literal, so none of it is imported or executed.
# NOTE(review): before re-enabling, confirm that the hard-coded 1902
# (presumably the vocabulary size of `cv` at the time) still matches X, and
# that the string ratings in df[1] are converted to numeric labels. As
# written it would also fit on all of X rather than on x_train.
'''
from keras.models import Sequential
from keras.layers import Dense
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=1902, activation='relu'))
model.add(Dense(1902, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X, df[1], epochs=150, batch_size=10)
'''

In [None]:
# Smoke test: encode one hand-written review with the fitted vectorizer and
# print the rating the trained classifier assigns to it.
review = "es pesimo profesor, no merece estar en esa universidad"
# transform() expects an iterable of documents, hence the one-element list.
test = [review]
val = cv.transform(test)
print(log_reg.predict(val))