python-data-science · JLCHUDGAR · Jan 9, 2018 · Jan 9, 2018
diff --git a/Supplies.csv b/Supplies.csv
@@ -0,0 +1,44 @@
+OrderDate,Region,Rep,Item,Units,Unit Price
+4-Jul-14,East,Richard,Pen Set,62,4.99
+12-Jul-14,East,Nick,Binder,29,1.99
+21-Jul-14,Central,Morgan,Pen Set,55,12.49
+29-Jul-14,East,Susan,Binder,81,19.99
+7-Aug-14,Central,Matthew,Pen Set,42,23.95
+15-Aug-14,East,Richard,Pencil,35,4.99
+24-Aug-14,West,James,Desk,3,275
+1-Sep-14,Central,Smith,Desk,2,125
+10-Sep-14,Central,Bill,Pencil,7,1.29
+18-Sep-14,East,Richard,Pen Set,16,15.99
+27-Sep-14,West,James,Pen,76,1.99
+5-Oct-14,Central,Morgan,Binder,28,8.99
+14-Oct-14,West,Thomas,Binder,57,19.99
+22-Oct-14,East,Richard,Pen,64,8.99
+31-Oct-14,Central,Rachel,Pencil,14,1.29
+8-Nov-14,East,Susan,Pen,15,19.99
+17-Nov-14,Central,Alex,Binder,11,4.99
+25-Nov-14,Central,Matthew,Pen Set,96,4.99
+4-Dec-14,Central,Alex,Binder,94,19.99
+12-Dec-14,Central,Smith,Pencil,67,1.29
+21-Dec-14,Central,Rachel,Binder,28,4.99
+29-Dec-14,East,Susan,Pen Set,74,15.99
+6-Jan-15,East,Richard,Pencil,95,1.99
+15-Jan-15,Central,Bill,Binder,46,8.99
+23-Jan-15,Central,Matthew,Binder,50,19.99
+1-Feb-15,Central,Smith,Binder,87,15
+9-Feb-15,Central,Alex,Pencil,36,4.99
+18-Feb-15,East,Richard,Binder,4,4.99
+26-Feb-15,Central,Bill,Pen,27,19.99
+7-Mar-15,West,James,Binder,7,19.99
+15-Mar-15,West,James,Pencil,56,2.99
+24-Mar-15,Central,Alex,Pen Set,50,4.99
+1-Apr-15,East,Richard,Binder,60,4.99
+10-Apr-15,Central,Rachel,Pencil,66,1.99
+18-Apr-15,Central,Rachel,Pencil,75,1.99
+27-Apr-15,East,Nick,Pen,96,4.99
+5-May-15,Central,Alex,Pencil,90,4.99
+14-May-15,Central,Bill,Pencil,53,1.29
+22-May-15,West,Thomas,Pencil,32,1.99
+31-May-15,Central,Bill,Binder,80,8.99
+8-Jun-15,East,Richard,Binder,60,8.99
+17-Jun-15,Central,Matthew,Desk,5,125
+25-Jun-15,Central,Morgan,Pencil,90,4.99
diff --git a/__pycache__/exercise.cpython-36.pyc b/__pycache__/exercise.cpython-36.pyc
diff --git a/__pycache__/morestats.cpython-36.pyc b/__pycache__/morestats.cpython-36.pyc
diff --git a/baseball.csv b/baseball.csv
diff --git a/baseball_analysis.py b/baseball_analysis.py
@@ -0,0 +1,19 @@
"""Summary statistics for the players listed in baseball.csv."""
import pandas as pd

import morestats as m

df = pd.read_csv('baseball.csv')

# Average height, weight and age across all players, using morestats.
avg_height = m.mean(df.Height)
avg_weight = m.mean(df.Weight)
avg_age = m.mean(df.Age)

# Group by team and take the mean of every numeric column.
# numeric_only=True skips any non-numeric columns, which pandas >= 2.0
# would otherwise refuse to average.
teams = df.groupby('Team').mean(numeric_only=True)

# Aggregate stats for Arizona. The index label must be a string;
# NOTE(review): 'ARZ' is presumably the Team code used for Arizona in
# baseball.csv -- verify against the data.
arizona = teams.loc['ARZ']

# Team with the greatest average height.
greatest = teams['Height'].idxmax()
diff --git a/exercise.py b/exercise.py
@@ -1,32 +1,55 @@
 # problem 1
 # ------------------------------------------------------------------- #
-people = ['Bob Smith', 'Ken Jones', 'Alex Bradino']
+"""
+python -m doctest -v exercise.py
+
+"""
 
+people = ['Bob Smith', 'Ken Jones', 'Alex Bradino']
 
 def sort_by_last_name(people, order):
-    # return full names sorted by last name in either ascending or descending order
+    """
+    sorts list alphabetically
+    >>> sort_by_last_name(people,False)
+    ['Alex Bradino', 'Ken Jones', 'Bob Smith']
+    >>> sort_by_last_name(people,True)
+    ['Bob Smith', 'Ken Jones', 'Alex Bradino']
+    """
+    # return full names sorted by last name in ascending order
+    # ['Alex Bradino', 'Ken Jones', 'Bob Smith']
     # add doctests make sure it passes
-    pass
+    return sorted(people, key=lambda person: person.split()[-1], reverse=order)
 
 
 # problem 2
 # ------------------------------------------------------------------- #
 names = ['James', 'Susan', 'Maggie']
 ages = [4, 9, 12]
 
-
 def create_dictionary_from_lists(names, ages):
+    """
+    creates a new dictionary from lists names and ages
+    >>> create_dictionary_from_lists(names, ages)
+    {'James': 4, 'Maggie': 12, 'Susan': 9}
+    """
     # {'James':4, 'Susan':9, 'Maggie':12}
     # add doctests make sure it passes
-    pass
-
+    mydict = {}
+    for i in range(len(names)):
+        mydict[names[i]] = ages[i]
+    return mydict
 
 # problem 3
 # ------------------------------------------------------------------- #
 numbers = [5, 6, 7, 8, 9, 10, 11, 12]
 
 
 def square_even_values_and_sum_under_10(numbers):
+    """
+    the squares for only the even values in list numbers that are less than 10
+    >>> square_even_values_and_sum_under_10(numbers)
+    100
+    """
     # 6^2 + 8^2]
     # add doctests make sure it passes
-    pass
+    return sum([n**2 for n in numbers if (n<10 and n%2==0)])
diff --git a/morestats.py b/morestats.py
@@ -0,0 +1,78 @@
"""
python -m doctest -v morestats.py

"""
# The string above must be the first statement in the file to count as the
# module docstring, so the greeting is printed after it.
print('Hello World!')
def add(num1, num2):
    """Return the result of ``num1 + num2``."""
    total = num1 + num2
    return total


# Quick demonstration, printed whenever the module is run or imported.
print(add(2, 3))
+
+
# Compute the volume of a rectangular box from its three dimensions.
def vrectangle(length, width, height):
    """Return ``length * width * height``."""
    base_area = length * width
    return base_area * height
+
# Compute the arithmetic mean of a collection of numbers.
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Args:
        numbers: any iterable of numbers. Sized containers are used as-is;
            one-shot iterables such as generators are copied into a list
            first.

    Raises:
        ValueError: if ``numbers`` is empty.

    >>> mean([1, 2, 3, 4])
    2.5
    >>> mean(n for n in (2, 4))
    3.0
    """
    if not hasattr(numbers, '__len__'):
        # Generators have no len(); materialise once so the values can be
        # both counted and summed.
        numbers = list(numbers)
    count = len(numbers)
    if count == 0:
        raise ValueError('mean() requires at least one value')
    return sum(numbers) / count
+
# Compute the median of a collection of numbers.
def median(numbers):
    """Return the median of ``numbers``.

    The input is sorted into a new list (the argument is not modified).
    For an odd count the middle value is returned unchanged; for an even
    count the two middle values are averaged.

    Args:
        numbers: iterable of numbers.

    Raises:
        ValueError: if ``numbers`` is empty (previously this case silently
            returned 0.0).

    >>> median([2, 1, 6])
    2
    >>> median([3, 5, 4, 9])
    4.5
    >>> median([])
    Traceback (most recent call last):
        ...
    ValueError: median() requires at least one value
    """
    ordered = sorted(numbers)
    if not ordered:
        raise ValueError('median() requires at least one value')
    middle = len(ordered) // 2
    if len(ordered) % 2:
        # Odd count: a single middle element.
        return ordered[middle]
    # Even count: average the two elements straddling the middle.
    return (ordered[middle - 1] + ordered[middle]) / 2
+
+from collections import defaultdict
def mode(numbers):
    """Return the most frequent value in ``numbers``.

    When several values share the highest count, the one whose first
    occurrence comes latest in the input is returned.

    Args:
        numbers: iterable of hashable values.

    Raises:
        ValueError: if ``numbers`` is empty.

    >>> mode([1, 1, 1, 1, 1, 1, 4, 5, 6])
    1
    >>> mode([1, 2, 2, 2, 3, 3, 4])
    2
    >>> mode([1, 1, 2, 2])
    2
    """
    counts = defaultdict(int)
    for num in numbers:
        counts[num] += 1
    if not counts:
        raise ValueError('mode() requires at least one value')
    # max() returns the first maximal item it meets; scanning the keys in
    # reverse insertion order keeps the tie-breaking rule described above
    # without sorting every distinct value.
    return max(reversed(list(counts)), key=counts.__getitem__)
+
# Variance describes the spread of the data.
# The square root of the variance is the standard deviation;
# one standard deviation is also called one sigma.

numbers = [1, 2, 3, 4, 5, 6, 7]


def variance(numbers, ddof=0):
    """Return the variance of ``numbers``.

    Args:
        numbers: sized collection of numbers (it is measured with ``len``
            and iterated).
        ddof: delta degrees of freedom; the sum of squared deviations is
            divided by ``len(numbers) - ddof``. Use 0 (the default) for a
            whole population and 1 for a sample.

    Raises:
        ValueError: if ``numbers`` is empty or ``len(numbers) - ddof`` is
            not positive.

    >>> variance([1, 2, 3, 4, 5, 6, 7], 0)
    4.0
    >>> round(variance([1, 2, 3, 4, 5, 6, 7], 1), 6)
    4.666667
    """
    count = len(numbers)
    divisor = count - ddof
    if count == 0 or divisor <= 0:
        raise ValueError('variance() requires more values than ddof')
    # Computed once; calling mean() inside the loop made this quadratic.
    center = mean(numbers)
    return sum((num - center) ** 2 for num in numbers) / divisor
+
+
def stdev(numbers, ddof=0):
    """Return the standard deviation of ``numbers``.

    This is the square root of ``variance(numbers, ddof)``.

    Args:
        numbers: collection of numbers, passed through to ``variance``.
        ddof: delta degrees of freedom, passed through to ``variance``;
            0 (the default) for a population, 1 for a sample.

    >>> stdev([1, 2, 3, 4, 5, 6, 7], 0)
    2.0
    >>> round(stdev([1, 2, 3, 4, 5, 6, 7], 1), 6)
    2.160247
    """
    return variance(numbers, ddof) ** 0.5
diff --git a/supplies_analysis.py b/supplies_analysis.py
@@ -0,0 +1,25 @@
"""Sales summaries for the office-supply orders in Supplies.csv."""
import pandas as pd

# The data file is named 'Supplies.csv'; the lower-case spelling only
# resolves on case-insensitive filesystems.
df = pd.read_csv('Supplies.csv')

# Add a new column: Total = Units * Unit Price.
df['Total'] = df['Units'] * df['Unit Price']

# Mean, sum and order count of Total for each rep within each region.
rep_stats = df.groupby(['Region', 'Rep'])['Total'].agg(['mean', 'sum', 'count'])

# The three (region, rep) pairs with the largest mean order total.
largestthree = rep_stats.nlargest(3, 'mean')

# Total sales per region, and the distinct reps who sold in each region.
regions = df.groupby(['Region'])['Total'].agg(['sum'])
reps = df.groupby(['Region'])['Rep'].unique()

# Convert the series into a dataframe and move Region back into a column
# so both tables can be merged on it.
rps = reps.to_frame()
reps = rps.reset_index()
regions = regions.reset_index()
merged = pd.merge(reps, regions, on='Region', how='inner').set_index('Region')

# Number of distinct reps per region, then regional sales per rep.
merged['count'] = merged['Rep'].apply(len)
merged['normalized'] = merged['sum'] / merged['count']