In [10]:
import math as M
# – return the Manhattan distance between two dictionary data points from the data set.
def manhattan_distance(data_point1, data_point2):
    """Return the Manhattan (L1) distance between two weather data points.

    Only the 'TMAX', 'TMIN' and 'PRCP' entries of each dictionary are
    compared; any other keys are ignored.  Every value is passed through
    float() first, so a point whose measurements are still strings (as
    returned by read_dataset) can be compared with a numeric point.

    Raises:
        KeyError: if either point lacks one of the three keys.
        ValueError: if a value cannot be converted to float.
    """
    dmax = abs(float(data_point1['TMAX']) - float(data_point2['TMAX']))
    dmin = abs(float(data_point1['TMIN']) - float(data_point2['TMIN']))
    dprcp = abs(float(data_point1['PRCP']) - float(data_point2['PRCP']))
    return dmax + dmin + dprcp
# – return a list of data point dictionaries read from the specified file.
def read_dataset(filename):
    """Return a list of data point dictionaries read from the given file.

    Each non-blank line is expected to hold five whitespace-separated
    fields in the order: date, precipitation, max temperature, min
    temperature, rain flag.  They are stored under the keys 'DATE',
    'PRCP', 'TMAX', 'TMIN' and 'RAIN' respectively.  All values are kept
    as the strings read from the file; no numeric conversion is done here.

    Blank lines are skipped.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than five fields.
    """
    weather_data = []
    # The context manager closes the file even if a malformed line raises.
    with open(filename, "r") as source:
        for line in source:
            # split() with no argument tolerates repeated spaces, tabs and
            # the trailing newline, and yields [] for a blank line.
            fields = line.split()
            if not fields:
                continue
            weather_data.append({
                'DATE': fields[0],
                'TMAX': fields[2],
                'PRCP': fields[1],
                'TMIN': fields[3],
                'RAIN': fields[4],
            })
    return weather_data

# – return a prediction of whether it is raining or not based on a majority vote of the list of neighbors.
def majority_vote(nearest_neighbors):
    """Predict rain by majority vote over the given neighbors.

    Counts the neighbors whose 'RAIN' entry equals the string "TRUE" and
    returns "TRUE" when they make up at least half of the list, otherwise
    "FALSE".  A tie therefore resolves to "TRUE", and so does an empty
    list.
    """
    rainy = sum(1 for neighbor in nearest_neighbors if neighbor['RAIN'] == "TRUE")
    # For integers, rainy >= ceil(n / 2) is the same test as 2 * rainy >= n.
    if 2 * rainy >= len(nearest_neighbors):
        return "TRUE"
    return "FALSE"

# – using the above functions, return the majority vote prediction for whether it's raining or not on the provided test point.
def k_nearest_neighbors(filename, test_point, k, year_interval):
    """Predict rain for test_point from its k nearest neighbors in a file.

    The data set is read from filename, narrowed to the records whose year
    appears in the interval computed from test_point['DATE'] and
    year_interval, annotated with a 'DIST' entry by distance_comp, and
    sorted by ascending 'DIST'.  The full sorted list is printed, then the
    first k records are handed to majority_vote, whose result is returned.
    """
    records = read_dataset(filename)
    allowed_years = calculate_interval(test_point['DATE'], year_interval)

    # Keep only the records that fall inside the year window.
    candidates = [
        record for record in records
        if is_in_interval(record["DATE"], allowed_years)
    ]

    ranked = sorted(
        distance_comp(candidates, test_point),
        key=lambda record: record['DIST'],
    )
    print(ranked)

    closest = ranked[:k]
    return majority_vote(closest)
      
def calculate_interval(date, year_interval):
    """Return the list of year strings surrounding the year of date.

    The year is taken from the part of date before the first "-".  The
    result covers year - year_interval + 1 through year + year_interval - 1
    inclusive, each rendered with str().  The label "years" and the
    resulting list are printed before returning.
    """
    center = int(date.split("-")[0])
    first = center - year_interval + 1
    last = center + year_interval - 1
    interval = [str(year) for year in range(first, last + 1)]
    print("years")
    print(interval)
    return interval

def is_in_interval(date, interval):
    """Return True when the year part of date is a member of interval.

    The year is the text before the first "-" in date and is compared as
    a string against the members of interval.
    """
    year = date.split("-")[0]
    return year in interval

def distance_comp(neighbors, test_point):
    """Annotate each neighbor with its distance to test_point.

    Stores the result of manhattan_distance(neighbor, test_point) under
    the 'DIST' key of every dictionary in neighbors.  The dictionaries are
    modified in place and the same list object is returned.
    """
    for entry in neighbors:
        entry['DIST'] = manhattan_distance(entry, test_point)
    return neighbors

# Load the data set at module level and keep it in weatherData.
# NOTE(review): the k_nearest_neighbors call below is given the file name and
# reads the file again itself, so it does not use this variable.
weatherData = read_dataset("rain.txt")

# Classify a sample test point dated 1948-01-01 with k=2 and year_interval=10.
# This is left as a bare expression so its return value is the cell result.
k_nearest_neighbors('rain.txt', {'DATE': '1948-01-01', 'TMAX': 51.0, 'PRCP': 0.47, 'TMIN': 42.0}, 2, 10)

years
['1939', '1940', '1941', '1942', '1943', '1944', '1945', '1946', '1947', '1948', '1949', '1950', '1951', '1952', '1953', '1954', '1955', '1956', '1957']
[{'DATE': '1948-01-01', 'TMAX': '51', 'PRCP': '0.47', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 0.0}, {'DATE': '1952-01-31', 'TMAX': '51', 'PRCP': '0.43', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 0.03999999999999998}, {'DATE': '1956-12-17', 'TMAX': '51', 'PRCP': '0.42', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 0.04999999999999999}, {'DATE': '1949-11-10', 'TMAX': '51', 'PRCP': '0.27', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 0.19999999999999996}, {'DATE': '1954-12-23', 'TMAX': '51', 'PRCP': '0.71', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 0.24}, {'DATE': '1953-01-15', 'TMAX': '51', 'PRCP': '0.49', 'TMIN': '41', 'RAIN': 'TRUE', 'DIST': 1.02}, {'DATE': '1955-05-30', 'TMAX': '52', 'PRCP': '0.53', 'TMIN': '42', 'RAIN': 'TRUE', 'DIST': 1.06}, {'DATE': '1948-09-22', 'TMAX': '51', 'PRCP': '0.36', 'TMIN': '43', 'RAIN': 'TRUE', 'DIST': 1.1099999999999999}

'TRUE'

>>> majority_vote([{'DATE': '2015-08-12', 'TMAX': 83.0, 'PRCP': 0.3, 'TMIN': 62.0, 'RAIN': 'TRUE'},
{'DATE': '2014-05-19', 'TMAX': 70.0, 'PRCP': 0.0, 'TMIN': 50.0, 'RAIN': 'FALSE'},
{'DATE': '2014-12-05', 'TMAX': 55.0, 'PRCP': 0.12, 'TMIN': 44.0, 'RAIN': 'TRUE'},
{'DATE': '1954-09-08', 'TMAX': 71.0, 'PRCP': 0.02, 'TMIN': 55.0, 'RAIN': 'TRUE'},
{'DATE': '2014-08-27', 'TMAX': 84.0, 'PRCP': 0.0, 'TMIN': 61.0, 'RAIN': 'FALSE'}])
=> 'TRUE'